#ifndef CUFFTDX_FFT_4096_FP32_INV_PTX_HPP
#define CUFFTDX_FFT_4096_FP32_INV_PTX_HPP



// Explicit specialization of cufftdx_private_function for function id 303 and
// float data. The whole body is a single inline-PTX statement that reads the 16
// complex values rmem[0..15] of the calling thread and overwrites those same 16
// values with its results.
// NOTE(review): the include guard of this file names it the fp32 inverse
// 4096-point FFT PTX header; what id 303 and the trailing template argument 1
// select is defined outside this file -- confirm against the dispatcher.
//
// Parameters:
//   rmem - per-thread array of 16 complex<float>; every .x/.y is both an input
//          and an output of the asm statement.
//   smem - 32-bit base address used directly with st.shared/ld.shared.
//
// Structure visible in the PTX:
//   1. A 16-input butterfly network (adds/subs plus multiplies by the constants
//      0f3F3504F3, 0f3F6C835E, 0f3EC3EF15 and their negations) on the register
//      inputs.
//   2. One complex factor is loaded with ld.global from lut_sp_16_4096 at byte
//      offset (tid.x & 255) * 8; results 1..15 of the network are multiplied by
//      successively higher powers of it (result 0 is stored unmodified).
//   3. Exchange through shared memory in two passes: first the values that end
//      up feeding the .x outputs, then those feeding the .y outputs. Each
//      thread stores 16 consecutive floats (64 bytes) and reloads 16 floats at
//      a 1024-byte stride.
//   4. A second 16-input butterfly network, a multiply by powers of a factor
//      loaded from lut_sp_16_256 (indexed by bits 4..7 of tid.x), and a second
//      two-pass shared-memory exchange (stores at a 64-byte stride, loads at a
//      1024-byte stride).
//   5. A third butterfly network whose final sums/differences are written
//      straight to the output operands.
//
// Shared-memory footprint derived from the address arithmetic:
//   base = smem + (tid.y << 14) + ((tid.x << 6) & -16384)
//   and each group of 256 consecutive tid.x values touches bytes
//   [base, base + 16384).
// NOTE(review): the masks suggest 256 threads cooperate on one transform of
// 256 * 16 = 4096 points -- confirm the launch configuration with the caller.
//
// Synchronization: the PTX executes "barrier.sync 0" eight times
// unconditionally, so every thread that participates in barrier 0 must reach
// this function.
// NOTE(review): the asm statement declares no "memory" clobber even though it
// reads and writes shared memory; callers that touch the same shared memory
// from C++ around this call should confirm the ordering they rely on.
template<> __forceinline__ __device__ void cufftdx_private_function<303, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// Operand numbering used inside the PTX string:
//   %0  .. %31 : outputs, rmem[i].x = %(2*i), rmem[i].y = %(2*i+1)
//   %32        : smem
//   %33, %34   : addresses of lut_sp_16_4096 and lut_sp_16_256
//   %35 .. %76 : inputs, in the order listed after the second ':' below
asm volatile (R"({
.reg .f32 f<952>;
.reg .b32 r<22>;
.reg .b64 rd<9>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 14;
mov.u32 r3, %32;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f65, %35, %56;
add.f32 f66, %36, %58;
sub.f32 f67, %35, %56;
sub.f32 f68, %36, %58;
add.f32 f69, %45, %67;
add.f32 f70, %47, %68;
sub.f32 f71, %45, %67;
sub.f32 f72, %47, %68;
add.f32 f73, f65, f69;
add.f32 f74, f66, f70;
sub.f32 f75, f65, f69;
sub.f32 f76, f66, f70;
sub.f32 f77, f67, f72;
add.f32 f78, f68, f71;
add.f32 f79, f67, f72;
sub.f32 f80, f68, f71;
add.f32 f81, %40, %61;
add.f32 f82, %42, %63;
sub.f32 f83, %40, %61;
sub.f32 f84, %42, %63;
add.f32 f85, %51, %72;
add.f32 f86, %52, %74;
sub.f32 f87, %51, %72;
sub.f32 f88, %52, %74;
add.f32 f89, f81, f85;
add.f32 f90, f82, f86;
sub.f32 f91, f81, f85;
sub.f32 f92, f82, f86;
sub.f32 f93, f83, f88;
add.f32 f94, f84, f87;
add.f32 f95, f83, f88;
sub.f32 f96, f84, f87;
mul.f32 f97, f93, 0f3F3504F3;
mul.f32 f98, f94, 0f3F3504F3;
sub.f32 f99, f97, f98;
add.f32 f100, f97, f98;
mul.f32 f101, f95, 0fBF3504F3;
mul.f32 f102, f96, 0f3F3504F3;
sub.f32 f103, f101, f102;
mul.f32 f104, f96, 0fBF3504F3;
fma.rn.f32 f105, f95, 0f3F3504F3, f104;
add.f32 f106, f73, f89;
add.f32 f107, f74, f90;
sub.f32 f108, f73, f89;
sub.f32 f109, f74, f90;
add.f32 f110, f77, f99;
add.f32 f111, f78, f100;
sub.f32 f112, f77, f99;
sub.f32 f113, f78, f100;
sub.f32 f114, f75, f92;
add.f32 f115, f76, f91;
add.f32 f116, f75, f92;
sub.f32 f117, f76, f91;
add.f32 f118, f79, f103;
add.f32 f119, f80, f105;
sub.f32 f120, f79, f103;
sub.f32 f121, f80, f105;
add.f32 f122, %37, %59;
add.f32 f123, %39, %60;
sub.f32 f124, %37, %59;
sub.f32 f125, %39, %60;
add.f32 f126, %48, %69;
add.f32 f127, %50, %71;
sub.f32 f128, %48, %69;
sub.f32 f129, %50, %71;
add.f32 f130, f122, f126;
add.f32 f131, f123, f127;
sub.f32 f132, f122, f126;
sub.f32 f133, f123, f127;
sub.f32 f134, f124, f129;
add.f32 f135, f125, f128;
add.f32 f136, f124, f129;
sub.f32 f137, f125, f128;
add.f32 f138, %43, %64;
add.f32 f139, %44, %66;
sub.f32 f140, %43, %64;
sub.f32 f141, %44, %66;
add.f32 f142, %53, %75;
add.f32 f143, %55, %76;
sub.f32 f144, %53, %75;
sub.f32 f145, %55, %76;
add.f32 f146, f138, f142;
add.f32 f147, f139, f143;
sub.f32 f148, f138, f142;
sub.f32 f149, f139, f143;
sub.f32 f150, f140, f145;
add.f32 f151, f141, f144;
add.f32 f152, f140, f145;
sub.f32 f153, f141, f144;
mul.f32 f154, f150, 0f3F3504F3;
mul.f32 f155, f151, 0f3F3504F3;
sub.f32 f156, f154, f155;
add.f32 f157, f154, f155;
mul.f32 f158, f152, 0fBF3504F3;
mul.f32 f159, f153, 0f3F3504F3;
sub.f32 f160, f158, f159;
mul.f32 f161, f153, 0fBF3504F3;
fma.rn.f32 f162, f152, 0f3F3504F3, f161;
add.f32 f163, f130, f146;
add.f32 f164, f131, f147;
sub.f32 f165, f130, f146;
sub.f32 f166, f131, f147;
add.f32 f167, f134, f156;
add.f32 f168, f135, f157;
sub.f32 f169, f134, f156;
sub.f32 f170, f135, f157;
sub.f32 f171, f132, f149;
add.f32 f172, f133, f148;
add.f32 f173, f132, f149;
sub.f32 f174, f133, f148;
add.f32 f175, f136, f160;
add.f32 f176, f137, f162;
sub.f32 f177, f136, f160;
sub.f32 f178, f137, f162;
mul.f32 f179, f167, 0f3F6C835E;
mul.f32 f180, f168, 0f3EC3EF15;
sub.f32 f181, f179, f180;
mul.f32 f182, f168, 0f3F6C835E;
fma.rn.f32 f183, f167, 0f3EC3EF15, f182;
mul.f32 f184, f171, 0f3F3504F3;
mul.f32 f185, f172, 0f3F3504F3;
sub.f32 f186, f184, f185;
add.f32 f187, f184, f185;
mul.f32 f188, f175, 0f3EC3EF15;
mul.f32 f189, f176, 0f3F6C835E;
sub.f32 f190, f188, f189;
mul.f32 f191, f176, 0f3EC3EF15;
fma.rn.f32 f192, f175, 0f3F6C835E, f191;
mul.f32 f193, f169, 0fBEC3EF15;
mul.f32 f194, f170, 0f3F6C835E;
sub.f32 f195, f193, f194;
mul.f32 f196, f170, 0fBEC3EF15;
fma.rn.f32 f197, f169, 0f3F6C835E, f196;
mul.f32 f198, f173, 0fBF3504F3;
mul.f32 f199, f174, 0f3F3504F3;
sub.f32 f200, f198, f199;
mul.f32 f201, f174, 0fBF3504F3;
fma.rn.f32 f202, f173, 0f3F3504F3, f201;
mul.f32 f203, f177, 0fBF6C835E;
mul.f32 f204, f178, 0f3EC3EF15;
sub.f32 f205, f203, f204;
mul.f32 f206, f178, 0fBF6C835E;
fma.rn.f32 f207, f177, 0f3EC3EF15, f206;
add.f32 f208, f106, f163;
add.f32 f209, f107, f164;
sub.f32 f210, f106, f163;
sub.f32 f211, f107, f164;
add.f32 f212, f110, f181;
add.f32 f213, f111, f183;
sub.f32 f214, f110, f181;
sub.f32 f215, f111, f183;
add.f32 f216, f114, f186;
add.f32 f217, f115, f187;
sub.f32 f218, f114, f186;
sub.f32 f219, f115, f187;
add.f32 f220, f118, f190;
add.f32 f221, f119, f192;
sub.f32 f222, f118, f190;
sub.f32 f223, f119, f192;
sub.f32 f224, f108, f166;
add.f32 f225, f109, f165;
add.f32 f226, f108, f166;
sub.f32 f227, f109, f165;
add.f32 f228, f112, f195;
add.f32 f229, f113, f197;
sub.f32 f230, f112, f195;
sub.f32 f231, f113, f197;
add.f32 f232, f116, f200;
add.f32 f233, f117, f202;
sub.f32 f234, f116, f200;
sub.f32 f235, f117, f202;
add.f32 f236, f120, f205;
add.f32 f237, f121, f207;
sub.f32 f238, f120, f205;
sub.f32 f239, f121, f207;
and.b32 r6, r5, 255;
shl.b32 r7, r5, 3;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 2040;
mov.u64 rd4, %33;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f240, f241}, [rd5];
mul.f32 f244, f213, f241;
fma.rn.f32 f245, f240, f212, f244;
mul.f32 f246, f212, f241;
mul.f32 f247, f240, f213;
sub.f32 f248, f247, f246;
mul.f32 f249, f240, f240;
mul.f32 f250, f241, f241;
sub.f32 f251, f249, f250;
mul.f32 f252, f241, f240;
fma.rn.f32 f253, f241, f240, f252;
mul.f32 f254, f217, f253;
fma.rn.f32 f255, f251, f216, f254;
mul.f32 f256, f216, f253;
mul.f32 f257, f251, f217;
sub.f32 f258, f257, f256;
mul.f32 f259, f240, f251;
mul.f32 f260, f241, f253;
sub.f32 f261, f259, f260;
mul.f32 f262, f240, f253;
fma.rn.f32 f263, f241, f251, f262;
mul.f32 f264, f221, f263;
fma.rn.f32 f265, f261, f220, f264;
mul.f32 f266, f220, f263;
mul.f32 f267, f261, f221;
sub.f32 f268, f267, f266;
mul.f32 f269, f240, f261;
mul.f32 f270, f241, f263;
sub.f32 f271, f269, f270;
mul.f32 f272, f240, f263;
fma.rn.f32 f273, f241, f261, f272;
mul.f32 f274, f225, f273;
fma.rn.f32 f275, f271, f224, f274;
mul.f32 f276, f224, f273;
mul.f32 f277, f271, f225;
sub.f32 f278, f277, f276;
mul.f32 f279, f240, f271;
mul.f32 f280, f241, f273;
sub.f32 f281, f279, f280;
mul.f32 f282, f240, f273;
fma.rn.f32 f283, f241, f271, f282;
mul.f32 f284, f229, f283;
fma.rn.f32 f285, f281, f228, f284;
mul.f32 f286, f228, f283;
mul.f32 f287, f281, f229;
sub.f32 f288, f287, f286;
mul.f32 f289, f240, f281;
mul.f32 f290, f241, f283;
sub.f32 f291, f289, f290;
mul.f32 f292, f240, f283;
fma.rn.f32 f293, f241, f281, f292;
mul.f32 f294, f233, f293;
fma.rn.f32 f295, f291, f232, f294;
mul.f32 f296, f232, f293;
mul.f32 f297, f291, f233;
sub.f32 f298, f297, f296;
mul.f32 f299, f240, f291;
mul.f32 f300, f241, f293;
sub.f32 f301, f299, f300;
mul.f32 f302, f240, f293;
fma.rn.f32 f303, f241, f291, f302;
mul.f32 f304, f237, f303;
fma.rn.f32 f305, f301, f236, f304;
mul.f32 f306, f236, f303;
mul.f32 f307, f301, f237;
sub.f32 f308, f307, f306;
mul.f32 f309, f240, f301;
mul.f32 f310, f241, f303;
sub.f32 f311, f309, f310;
mul.f32 f312, f240, f303;
fma.rn.f32 f313, f241, f301, f312;
mul.f32 f314, f211, f313;
fma.rn.f32 f315, f311, f210, f314;
mul.f32 f316, f210, f313;
mul.f32 f317, f311, f211;
sub.f32 f318, f317, f316;
mul.f32 f319, f240, f311;
mul.f32 f320, f241, f313;
sub.f32 f321, f319, f320;
mul.f32 f322, f240, f313;
fma.rn.f32 f323, f241, f311, f322;
mul.f32 f324, f215, f323;
fma.rn.f32 f325, f321, f214, f324;
mul.f32 f326, f214, f323;
mul.f32 f327, f321, f215;
sub.f32 f328, f327, f326;
mul.f32 f329, f240, f321;
mul.f32 f330, f241, f323;
sub.f32 f331, f329, f330;
mul.f32 f332, f240, f323;
fma.rn.f32 f333, f241, f321, f332;
mul.f32 f334, f219, f333;
fma.rn.f32 f335, f331, f218, f334;
mul.f32 f336, f218, f333;
mul.f32 f337, f331, f219;
sub.f32 f338, f337, f336;
mul.f32 f339, f240, f331;
mul.f32 f340, f241, f333;
sub.f32 f341, f339, f340;
mul.f32 f342, f240, f333;
fma.rn.f32 f343, f241, f331, f342;
mul.f32 f344, f223, f343;
fma.rn.f32 f345, f341, f222, f344;
mul.f32 f346, f222, f343;
mul.f32 f347, f341, f223;
sub.f32 f348, f347, f346;
mul.f32 f349, f240, f341;
mul.f32 f350, f241, f343;
sub.f32 f351, f349, f350;
mul.f32 f352, f240, f343;
fma.rn.f32 f353, f241, f341, f352;
mul.f32 f354, f227, f353;
fma.rn.f32 f355, f351, f226, f354;
mul.f32 f356, f226, f353;
mul.f32 f357, f351, f227;
sub.f32 f358, f357, f356;
mul.f32 f359, f240, f351;
mul.f32 f360, f241, f353;
sub.f32 f361, f359, f360;
mul.f32 f362, f240, f353;
fma.rn.f32 f363, f241, f351, f362;
mul.f32 f364, f231, f363;
fma.rn.f32 f365, f361, f230, f364;
mul.f32 f366, f230, f363;
mul.f32 f367, f361, f231;
sub.f32 f368, f367, f366;
mul.f32 f369, f240, f361;
mul.f32 f370, f241, f363;
sub.f32 f371, f369, f370;
mul.f32 f372, f240, f363;
fma.rn.f32 f373, f241, f361, f372;
mul.f32 f374, f235, f373;
fma.rn.f32 f375, f371, f234, f374;
mul.f32 f376, f234, f373;
mul.f32 f377, f371, f235;
sub.f32 f378, f377, f376;
mul.f32 f379, f240, f371;
mul.f32 f380, f241, f373;
sub.f32 f381, f379, f380;
mul.f32 f382, f240, f373;
fma.rn.f32 f383, f241, f371, f382;
mul.f32 f384, f239, f383;
fma.rn.f32 f385, f381, f238, f384;
mul.f32 f386, f238, f383;
mul.f32 f387, f381, f239;
sub.f32 f388, f387, f386;
shl.b32 r8, r5, 6;
and.b32 r9, r8, -16384;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 16320;
add.s32 r12, r10, r11;
st.shared.v4.f32 [r12], {f208, f245, f255, f265};
st.shared.v4.f32 [r12+16], {f275, f285, f295, f305};
st.shared.v4.f32 [r12+32], {f315, f325, f335, f345};
st.shared.v4.f32 [r12+48], {f355, f365, f375, f385};
barrier.sync 0;
mad.lo.s32 r13, r6, -60, r12;
ld.shared.f32 f389, [r13];
ld.shared.f32 f390, [r13+1024];
ld.shared.f32 f391, [r13+2048];
ld.shared.f32 f392, [r13+3072];
ld.shared.f32 f393, [r13+4096];
ld.shared.f32 f394, [r13+5120];
ld.shared.f32 f395, [r13+6144];
ld.shared.f32 f396, [r13+7168];
ld.shared.f32 f397, [r13+8192];
ld.shared.f32 f398, [r13+9216];
ld.shared.f32 f399, [r13+10240];
ld.shared.f32 f400, [r13+11264];
ld.shared.f32 f401, [r13+12288];
ld.shared.f32 f402, [r13+13312];
ld.shared.f32 f403, [r13+14336];
ld.shared.f32 f404, [r13+15360];
barrier.sync 0;
st.shared.v4.f32 [r12], {f209, f248, f258, f268};
st.shared.v4.f32 [r12+16], {f278, f288, f298, f308};
st.shared.v4.f32 [r12+32], {f318, f328, f338, f348};
st.shared.v4.f32 [r12+48], {f358, f368, f378, f388};
barrier.sync 0;
ld.shared.f32 f405, [r13];
ld.shared.f32 f406, [r13+1024];
ld.shared.f32 f407, [r13+2048];
ld.shared.f32 f408, [r13+3072];
ld.shared.f32 f409, [r13+4096];
ld.shared.f32 f410, [r13+5120];
ld.shared.f32 f411, [r13+6144];
ld.shared.f32 f412, [r13+7168];
ld.shared.f32 f413, [r13+8192];
ld.shared.f32 f414, [r13+9216];
ld.shared.f32 f415, [r13+10240];
ld.shared.f32 f416, [r13+11264];
ld.shared.f32 f417, [r13+12288];
ld.shared.f32 f418, [r13+13312];
ld.shared.f32 f419, [r13+14336];
ld.shared.f32 f420, [r13+15360];
add.f32 f421, f389, f397;
add.f32 f422, f405, f413;
sub.f32 f423, f389, f397;
sub.f32 f424, f405, f413;
add.f32 f425, f393, f401;
add.f32 f426, f409, f417;
sub.f32 f427, f393, f401;
sub.f32 f428, f409, f417;
add.f32 f429, f421, f425;
add.f32 f430, f422, f426;
sub.f32 f431, f421, f425;
sub.f32 f432, f422, f426;
sub.f32 f433, f423, f428;
add.f32 f434, f424, f427;
add.f32 f435, f423, f428;
sub.f32 f436, f424, f427;
add.f32 f437, f391, f399;
add.f32 f438, f407, f415;
sub.f32 f439, f391, f399;
sub.f32 f440, f407, f415;
add.f32 f441, f395, f403;
add.f32 f442, f411, f419;
sub.f32 f443, f395, f403;
sub.f32 f444, f411, f419;
add.f32 f445, f437, f441;
add.f32 f446, f438, f442;
sub.f32 f447, f437, f441;
sub.f32 f448, f438, f442;
sub.f32 f449, f439, f444;
add.f32 f450, f440, f443;
add.f32 f451, f439, f444;
sub.f32 f452, f440, f443;
mul.f32 f453, f449, 0f3F3504F3;
mul.f32 f454, f450, 0f3F3504F3;
sub.f32 f455, f453, f454;
add.f32 f456, f453, f454;
mul.f32 f457, f451, 0fBF3504F3;
mul.f32 f458, f452, 0f3F3504F3;
sub.f32 f459, f457, f458;
mul.f32 f460, f452, 0fBF3504F3;
fma.rn.f32 f461, f451, 0f3F3504F3, f460;
add.f32 f462, f429, f445;
add.f32 f463, f430, f446;
sub.f32 f464, f429, f445;
sub.f32 f465, f430, f446;
add.f32 f466, f433, f455;
add.f32 f467, f434, f456;
sub.f32 f468, f433, f455;
sub.f32 f469, f434, f456;
sub.f32 f470, f431, f448;
add.f32 f471, f432, f447;
add.f32 f472, f431, f448;
sub.f32 f473, f432, f447;
add.f32 f474, f435, f459;
add.f32 f475, f436, f461;
sub.f32 f476, f435, f459;
sub.f32 f477, f436, f461;
add.f32 f478, f390, f398;
add.f32 f479, f406, f414;
sub.f32 f480, f390, f398;
sub.f32 f481, f406, f414;
add.f32 f482, f394, f402;
add.f32 f483, f410, f418;
sub.f32 f484, f394, f402;
sub.f32 f485, f410, f418;
add.f32 f486, f478, f482;
add.f32 f487, f479, f483;
sub.f32 f488, f478, f482;
sub.f32 f489, f479, f483;
sub.f32 f490, f480, f485;
add.f32 f491, f481, f484;
add.f32 f492, f480, f485;
sub.f32 f493, f481, f484;
add.f32 f494, f392, f400;
add.f32 f495, f408, f416;
sub.f32 f496, f392, f400;
sub.f32 f497, f408, f416;
add.f32 f498, f396, f404;
add.f32 f499, f412, f420;
sub.f32 f500, f396, f404;
sub.f32 f501, f412, f420;
add.f32 f502, f494, f498;
add.f32 f503, f495, f499;
sub.f32 f504, f494, f498;
sub.f32 f505, f495, f499;
sub.f32 f506, f496, f501;
add.f32 f507, f497, f500;
add.f32 f508, f496, f501;
sub.f32 f509, f497, f500;
mul.f32 f510, f506, 0f3F3504F3;
mul.f32 f511, f507, 0f3F3504F3;
sub.f32 f512, f510, f511;
add.f32 f513, f510, f511;
mul.f32 f514, f508, 0fBF3504F3;
mul.f32 f515, f509, 0f3F3504F3;
sub.f32 f516, f514, f515;
mul.f32 f517, f509, 0fBF3504F3;
fma.rn.f32 f518, f508, 0f3F3504F3, f517;
add.f32 f519, f486, f502;
add.f32 f520, f487, f503;
sub.f32 f521, f486, f502;
sub.f32 f522, f487, f503;
add.f32 f523, f490, f512;
add.f32 f524, f491, f513;
sub.f32 f525, f490, f512;
sub.f32 f526, f491, f513;
sub.f32 f527, f488, f505;
add.f32 f528, f489, f504;
add.f32 f529, f488, f505;
sub.f32 f530, f489, f504;
add.f32 f531, f492, f516;
add.f32 f532, f493, f518;
sub.f32 f533, f492, f516;
sub.f32 f534, f493, f518;
mul.f32 f535, f523, 0f3F6C835E;
mul.f32 f536, f524, 0f3EC3EF15;
sub.f32 f537, f535, f536;
mul.f32 f538, f524, 0f3F6C835E;
fma.rn.f32 f539, f523, 0f3EC3EF15, f538;
mul.f32 f540, f527, 0f3F3504F3;
mul.f32 f541, f528, 0f3F3504F3;
sub.f32 f542, f540, f541;
add.f32 f543, f540, f541;
mul.f32 f544, f531, 0f3EC3EF15;
mul.f32 f545, f532, 0f3F6C835E;
sub.f32 f546, f544, f545;
mul.f32 f547, f532, 0f3EC3EF15;
fma.rn.f32 f548, f531, 0f3F6C835E, f547;
mul.f32 f549, f525, 0fBEC3EF15;
mul.f32 f550, f526, 0f3F6C835E;
sub.f32 f551, f549, f550;
mul.f32 f552, f526, 0fBEC3EF15;
fma.rn.f32 f553, f525, 0f3F6C835E, f552;
mul.f32 f554, f529, 0fBF3504F3;
mul.f32 f555, f530, 0f3F3504F3;
sub.f32 f556, f554, f555;
mul.f32 f557, f530, 0fBF3504F3;
fma.rn.f32 f558, f529, 0f3F3504F3, f557;
mul.f32 f559, f533, 0fBF6C835E;
mul.f32 f560, f534, 0f3EC3EF15;
sub.f32 f561, f559, f560;
mul.f32 f562, f534, 0fBF6C835E;
fma.rn.f32 f563, f533, 0f3EC3EF15, f562;
add.f32 f564, f462, f519;
add.f32 f565, f463, f520;
sub.f32 f566, f462, f519;
sub.f32 f567, f463, f520;
add.f32 f568, f466, f537;
add.f32 f569, f467, f539;
sub.f32 f570, f466, f537;
sub.f32 f571, f467, f539;
add.f32 f572, f470, f542;
add.f32 f573, f471, f543;
sub.f32 f574, f470, f542;
sub.f32 f575, f471, f543;
add.f32 f576, f474, f546;
add.f32 f577, f475, f548;
sub.f32 f578, f474, f546;
sub.f32 f579, f475, f548;
sub.f32 f580, f464, f522;
add.f32 f581, f465, f521;
add.f32 f582, f464, f522;
sub.f32 f583, f465, f521;
add.f32 f584, f468, f551;
add.f32 f585, f469, f553;
sub.f32 f586, f468, f551;
sub.f32 f587, f469, f553;
add.f32 f588, f472, f556;
add.f32 f589, f473, f558;
sub.f32 f590, f472, f556;
sub.f32 f591, f473, f558;
add.f32 f592, f476, f561;
add.f32 f593, f477, f563;
sub.f32 f594, f476, f561;
sub.f32 f595, f477, f563;
and.b32 r14, r5, 240;
bfe.u32 r15, r5, 4, 4;
mul.wide.u32 rd6, r15, 8;
mov.u64 rd7, %34;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f596, f597}, [rd8];
mul.f32 f600, f569, f597;
fma.rn.f32 f601, f596, f568, f600;
mul.f32 f602, f568, f597;
mul.f32 f603, f596, f569;
sub.f32 f604, f603, f602;
mul.f32 f605, f596, f596;
mul.f32 f606, f597, f597;
sub.f32 f607, f605, f606;
mul.f32 f608, f597, f596;
fma.rn.f32 f609, f597, f596, f608;
mul.f32 f610, f573, f609;
fma.rn.f32 f611, f607, f572, f610;
mul.f32 f612, f572, f609;
mul.f32 f613, f607, f573;
sub.f32 f614, f613, f612;
mul.f32 f615, f596, f607;
mul.f32 f616, f597, f609;
sub.f32 f617, f615, f616;
mul.f32 f618, f596, f609;
fma.rn.f32 f619, f597, f607, f618;
mul.f32 f620, f577, f619;
fma.rn.f32 f621, f617, f576, f620;
mul.f32 f622, f576, f619;
mul.f32 f623, f617, f577;
sub.f32 f624, f623, f622;
mul.f32 f625, f596, f617;
mul.f32 f626, f597, f619;
sub.f32 f627, f625, f626;
mul.f32 f628, f596, f619;
fma.rn.f32 f629, f597, f617, f628;
mul.f32 f630, f581, f629;
fma.rn.f32 f631, f627, f580, f630;
mul.f32 f632, f580, f629;
mul.f32 f633, f627, f581;
sub.f32 f634, f633, f632;
mul.f32 f635, f596, f627;
mul.f32 f636, f597, f629;
sub.f32 f637, f635, f636;
mul.f32 f638, f596, f629;
fma.rn.f32 f639, f597, f627, f638;
mul.f32 f640, f585, f639;
fma.rn.f32 f641, f637, f584, f640;
mul.f32 f642, f584, f639;
mul.f32 f643, f637, f585;
sub.f32 f644, f643, f642;
mul.f32 f645, f596, f637;
mul.f32 f646, f597, f639;
sub.f32 f647, f645, f646;
mul.f32 f648, f596, f639;
fma.rn.f32 f649, f597, f637, f648;
mul.f32 f650, f589, f649;
fma.rn.f32 f651, f647, f588, f650;
mul.f32 f652, f588, f649;
mul.f32 f653, f647, f589;
sub.f32 f654, f653, f652;
mul.f32 f655, f596, f647;
mul.f32 f656, f597, f649;
sub.f32 f657, f655, f656;
mul.f32 f658, f596, f649;
fma.rn.f32 f659, f597, f647, f658;
mul.f32 f660, f593, f659;
fma.rn.f32 f661, f657, f592, f660;
mul.f32 f662, f592, f659;
mul.f32 f663, f657, f593;
sub.f32 f664, f663, f662;
mul.f32 f665, f596, f657;
mul.f32 f666, f597, f659;
sub.f32 f667, f665, f666;
mul.f32 f668, f596, f659;
fma.rn.f32 f669, f597, f657, f668;
mul.f32 f670, f567, f669;
fma.rn.f32 f671, f667, f566, f670;
mul.f32 f672, f566, f669;
mul.f32 f673, f667, f567;
sub.f32 f674, f673, f672;
mul.f32 f675, f596, f667;
mul.f32 f676, f597, f669;
sub.f32 f677, f675, f676;
mul.f32 f678, f596, f669;
fma.rn.f32 f679, f597, f667, f678;
mul.f32 f680, f571, f679;
fma.rn.f32 f681, f677, f570, f680;
mul.f32 f682, f570, f679;
mul.f32 f683, f677, f571;
sub.f32 f684, f683, f682;
mul.f32 f685, f596, f677;
mul.f32 f686, f597, f679;
sub.f32 f687, f685, f686;
mul.f32 f688, f596, f679;
fma.rn.f32 f689, f597, f677, f688;
mul.f32 f690, f575, f689;
fma.rn.f32 f691, f687, f574, f690;
mul.f32 f692, f574, f689;
mul.f32 f693, f687, f575;
sub.f32 f694, f693, f692;
mul.f32 f695, f596, f687;
mul.f32 f696, f597, f689;
sub.f32 f697, f695, f696;
mul.f32 f698, f596, f689;
fma.rn.f32 f699, f597, f687, f698;
mul.f32 f700, f579, f699;
fma.rn.f32 f701, f697, f578, f700;
mul.f32 f702, f578, f699;
mul.f32 f703, f697, f579;
sub.f32 f704, f703, f702;
mul.f32 f705, f596, f697;
mul.f32 f706, f597, f699;
sub.f32 f707, f705, f706;
mul.f32 f708, f596, f699;
fma.rn.f32 f709, f597, f697, f708;
mul.f32 f710, f583, f709;
fma.rn.f32 f711, f707, f582, f710;
mul.f32 f712, f582, f709;
mul.f32 f713, f707, f583;
sub.f32 f714, f713, f712;
mul.f32 f715, f596, f707;
mul.f32 f716, f597, f709;
sub.f32 f717, f715, f716;
mul.f32 f718, f596, f709;
fma.rn.f32 f719, f597, f707, f718;
mul.f32 f720, f587, f719;
fma.rn.f32 f721, f717, f586, f720;
mul.f32 f722, f586, f719;
mul.f32 f723, f717, f587;
sub.f32 f724, f723, f722;
mul.f32 f725, f596, f717;
mul.f32 f726, f597, f719;
sub.f32 f727, f725, f726;
mul.f32 f728, f596, f719;
fma.rn.f32 f729, f597, f717, f728;
mul.f32 f730, f591, f729;
fma.rn.f32 f731, f727, f590, f730;
mul.f32 f732, f590, f729;
mul.f32 f733, f727, f591;
sub.f32 f734, f733, f732;
mul.f32 f735, f596, f727;
mul.f32 f736, f597, f729;
sub.f32 f737, f735, f736;
mul.f32 f738, f596, f729;
fma.rn.f32 f739, f597, f727, f738;
mul.f32 f740, f595, f739;
fma.rn.f32 f741, f737, f594, f740;
mul.f32 f742, f594, f739;
mul.f32 f743, f737, f595;
sub.f32 f744, f743, f742;
shl.b32 r16, r5, 2;
and.b32 r17, r16, 60;
add.s32 r18, r10, r17;
barrier.sync 0;
and.b32 r19, r8, 15360;
add.s32 r20, r18, r19;
st.shared.f32 [r20], f564;
st.shared.f32 [r20+64], f601;
st.shared.f32 [r20+128], f611;
st.shared.f32 [r20+192], f621;
st.shared.f32 [r20+256], f631;
st.shared.f32 [r20+320], f641;
st.shared.f32 [r20+384], f651;
st.shared.f32 [r20+448], f661;
st.shared.f32 [r20+512], f671;
st.shared.f32 [r20+576], f681;
st.shared.f32 [r20+640], f691;
st.shared.f32 [r20+704], f701;
st.shared.f32 [r20+768], f711;
st.shared.f32 [r20+832], f721;
st.shared.f32 [r20+896], f731;
st.shared.f32 [r20+960], f741;
barrier.sync 0;
mad.lo.s32 r21, r14, -60, r20;
ld.shared.f32 f745, [r21];
ld.shared.f32 f746, [r21+1024];
ld.shared.f32 f747, [r21+2048];
ld.shared.f32 f748, [r21+3072];
ld.shared.f32 f749, [r21+4096];
ld.shared.f32 f750, [r21+5120];
ld.shared.f32 f751, [r21+6144];
ld.shared.f32 f752, [r21+7168];
ld.shared.f32 f753, [r21+8192];
ld.shared.f32 f754, [r21+9216];
ld.shared.f32 f755, [r21+10240];
ld.shared.f32 f756, [r21+11264];
ld.shared.f32 f757, [r21+12288];
ld.shared.f32 f758, [r21+13312];
ld.shared.f32 f759, [r21+14336];
ld.shared.f32 f760, [r21+15360];
barrier.sync 0;
st.shared.f32 [r20], f565;
st.shared.f32 [r20+64], f604;
st.shared.f32 [r20+128], f614;
st.shared.f32 [r20+192], f624;
st.shared.f32 [r20+256], f634;
st.shared.f32 [r20+320], f644;
st.shared.f32 [r20+384], f654;
st.shared.f32 [r20+448], f664;
st.shared.f32 [r20+512], f674;
st.shared.f32 [r20+576], f684;
st.shared.f32 [r20+640], f694;
st.shared.f32 [r20+704], f704;
st.shared.f32 [r20+768], f714;
st.shared.f32 [r20+832], f724;
st.shared.f32 [r20+896], f734;
st.shared.f32 [r20+960], f744;
barrier.sync 0;
ld.shared.f32 f761, [r21];
ld.shared.f32 f762, [r21+1024];
ld.shared.f32 f763, [r21+2048];
ld.shared.f32 f764, [r21+3072];
ld.shared.f32 f765, [r21+4096];
ld.shared.f32 f766, [r21+5120];
ld.shared.f32 f767, [r21+6144];
ld.shared.f32 f768, [r21+7168];
ld.shared.f32 f769, [r21+8192];
ld.shared.f32 f770, [r21+9216];
ld.shared.f32 f771, [r21+10240];
ld.shared.f32 f772, [r21+11264];
ld.shared.f32 f773, [r21+12288];
ld.shared.f32 f774, [r21+13312];
ld.shared.f32 f775, [r21+14336];
ld.shared.f32 f776, [r21+15360];
add.f32 f777, f745, f753;
add.f32 f778, f761, f769;
sub.f32 f779, f745, f753;
sub.f32 f780, f761, f769;
add.f32 f781, f749, f757;
add.f32 f782, f765, f773;
sub.f32 f783, f749, f757;
sub.f32 f784, f765, f773;
add.f32 f785, f777, f781;
add.f32 f786, f778, f782;
sub.f32 f787, f777, f781;
sub.f32 f788, f778, f782;
sub.f32 f789, f779, f784;
add.f32 f790, f780, f783;
add.f32 f791, f779, f784;
sub.f32 f792, f780, f783;
add.f32 f793, f747, f755;
add.f32 f794, f763, f771;
sub.f32 f795, f747, f755;
sub.f32 f796, f763, f771;
add.f32 f797, f751, f759;
add.f32 f798, f767, f775;
sub.f32 f799, f751, f759;
sub.f32 f800, f767, f775;
add.f32 f801, f793, f797;
add.f32 f802, f794, f798;
sub.f32 f803, f793, f797;
sub.f32 f804, f794, f798;
sub.f32 f805, f795, f800;
add.f32 f806, f796, f799;
add.f32 f807, f795, f800;
sub.f32 f808, f796, f799;
mul.f32 f809, f805, 0f3F3504F3;
mul.f32 f810, f806, 0f3F3504F3;
sub.f32 f811, f809, f810;
add.f32 f812, f809, f810;
mul.f32 f813, f807, 0fBF3504F3;
mul.f32 f814, f808, 0f3F3504F3;
sub.f32 f815, f813, f814;
mul.f32 f816, f808, 0fBF3504F3;
fma.rn.f32 f817, f807, 0f3F3504F3, f816;
add.f32 f818, f785, f801;
add.f32 f819, f786, f802;
sub.f32 f820, f785, f801;
sub.f32 f821, f786, f802;
add.f32 f822, f789, f811;
add.f32 f823, f790, f812;
sub.f32 f824, f789, f811;
sub.f32 f825, f790, f812;
sub.f32 f826, f787, f804;
add.f32 f827, f788, f803;
add.f32 f828, f787, f804;
sub.f32 f829, f788, f803;
add.f32 f830, f791, f815;
add.f32 f831, f792, f817;
sub.f32 f832, f791, f815;
sub.f32 f833, f792, f817;
add.f32 f834, f746, f754;
add.f32 f835, f762, f770;
sub.f32 f836, f746, f754;
sub.f32 f837, f762, f770;
add.f32 f838, f750, f758;
add.f32 f839, f766, f774;
sub.f32 f840, f750, f758;
sub.f32 f841, f766, f774;
add.f32 f842, f834, f838;
add.f32 f843, f835, f839;
sub.f32 f844, f834, f838;
sub.f32 f845, f835, f839;
sub.f32 f846, f836, f841;
add.f32 f847, f837, f840;
add.f32 f848, f836, f841;
sub.f32 f849, f837, f840;
add.f32 f850, f748, f756;
add.f32 f851, f764, f772;
sub.f32 f852, f748, f756;
sub.f32 f853, f764, f772;
add.f32 f854, f752, f760;
add.f32 f855, f768, f776;
sub.f32 f856, f752, f760;
sub.f32 f857, f768, f776;
add.f32 f858, f850, f854;
add.f32 f859, f851, f855;
sub.f32 f860, f850, f854;
sub.f32 f861, f851, f855;
sub.f32 f862, f852, f857;
add.f32 f863, f853, f856;
add.f32 f864, f852, f857;
sub.f32 f865, f853, f856;
mul.f32 f866, f862, 0f3F3504F3;
mul.f32 f867, f863, 0f3F3504F3;
sub.f32 f868, f866, f867;
add.f32 f869, f866, f867;
mul.f32 f870, f864, 0fBF3504F3;
mul.f32 f871, f865, 0f3F3504F3;
sub.f32 f872, f870, f871;
mul.f32 f873, f865, 0fBF3504F3;
fma.rn.f32 f874, f864, 0f3F3504F3, f873;
add.f32 f875, f842, f858;
add.f32 f876, f843, f859;
sub.f32 f877, f842, f858;
sub.f32 f878, f843, f859;
add.f32 f879, f846, f868;
add.f32 f880, f847, f869;
sub.f32 f881, f846, f868;
sub.f32 f882, f847, f869;
sub.f32 f883, f844, f861;
add.f32 f884, f845, f860;
add.f32 f885, f844, f861;
sub.f32 f886, f845, f860;
add.f32 f887, f848, f872;
add.f32 f888, f849, f874;
sub.f32 f889, f848, f872;
sub.f32 f890, f849, f874;
mul.f32 f891, f879, 0f3F6C835E;
mul.f32 f892, f880, 0f3EC3EF15;
sub.f32 f893, f891, f892;
mul.f32 f894, f880, 0f3F6C835E;
fma.rn.f32 f895, f879, 0f3EC3EF15, f894;
mul.f32 f896, f883, 0f3F3504F3;
mul.f32 f897, f884, 0f3F3504F3;
sub.f32 f898, f896, f897;
add.f32 f899, f896, f897;
mul.f32 f900, f887, 0f3EC3EF15;
mul.f32 f901, f888, 0f3F6C835E;
sub.f32 f902, f900, f901;
mul.f32 f903, f888, 0f3EC3EF15;
fma.rn.f32 f904, f887, 0f3F6C835E, f903;
mul.f32 f905, f881, 0fBEC3EF15;
mul.f32 f906, f882, 0f3F6C835E;
sub.f32 f907, f905, f906;
mul.f32 f908, f882, 0fBEC3EF15;
fma.rn.f32 f909, f881, 0f3F6C835E, f908;
mul.f32 f910, f885, 0fBF3504F3;
mul.f32 f911, f886, 0f3F3504F3;
sub.f32 f912, f910, f911;
mul.f32 f913, f886, 0fBF3504F3;
fma.rn.f32 f914, f885, 0f3F3504F3, f913;
mul.f32 f915, f889, 0fBF6C835E;
mul.f32 f916, f890, 0f3EC3EF15;
sub.f32 f917, f915, f916;
mul.f32 f918, f890, 0fBF6C835E;
fma.rn.f32 f919, f889, 0f3EC3EF15, f918;
add.f32 %0, f818, f875;
add.f32 %1, f819, f876;
add.f32 %3, f823, f895;
add.f32 %2, f822, f893;
add.f32 %5, f827, f899;
add.f32 %4, f826, f898;
add.f32 %7, f831, f904;
add.f32 %6, f830, f902;
add.f32 %9, f821, f877;
sub.f32 %8, f820, f878;
add.f32 %11, f825, f909;
add.f32 %10, f824, f907;
add.f32 %13, f829, f914;
add.f32 %12, f828, f912;
add.f32 %15, f833, f919;
add.f32 %14, f832, f917;
sub.f32 %16, f818, f875;
sub.f32 %17, f819, f876;
sub.f32 %19, f823, f895;
sub.f32 %18, f822, f893;
sub.f32 %21, f827, f899;
sub.f32 %20, f826, f898;
sub.f32 %23, f831, f904;
sub.f32 %22, f830, f902;
sub.f32 %25, f821, f877;
add.f32 %24, f820, f878;
sub.f32 %27, f825, f909;
sub.f32 %26, f824, f907;
sub.f32 %29, f829, f914;
sub.f32 %28, f828, f912;
sub.f32 %31, f833, f919;
sub.f32 %30, f832, f917;
})"
     // Constraint lists. Outputs come first (%0-%31), then the inputs.
     // Several .y inputs appear twice in a row in the input list (for example
     // rmem[1].y is both %38 and %39), so input operand numbers do not follow a
     // simple 2*i pattern; the PTX above references only the second entry of
     // each duplicated pair. Do not "deduplicate" this list without renumbering
     // every %N in the string.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_4096), "l"(lut_sp_16_256), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y));
};




// Specialization <305, float, 1> of cufftdx_private_function: a single
// inline-PTX block that reads the eight complex values rmem[0..7] and
// overwrites all sixteen of their .x/.y components with the result.
//
// Parameters:
//   rmem - per-thread array of complex<float>; rmem[0..7] are both the inputs
//          and the outputs of the asm block.
//   smem - 32-bit base address used for every st.shared / ld.shared access.
//
// Structure visible in the PTX:
//   * Four passes of the same 8-value add/sub pattern. The literal 0f3F3504F3
//     is the float bit pattern of ~0.70710678 (0fBF3504F3 is its negation).
//   * After each of the first three passes a float2 is loaded from
//     lut_sp_8_4096, lut_sp_8_512 and lut_sp_8_64 respectively (byte offset
//     derived from %tid.x). Its powers 2..7 are built by repeated complex
//     multiplication, and seven of the eight pass outputs are multiplied by
//     the conjugates of powers 1..7 (the first output is left unscaled).
//   * The scaled values are then written to shared memory and re-read with a
//     2048-byte stride: first all first components, then all second
//     components, each store/load phase separated by barrier.sync 0.
//   * Shared-memory base per thread:
//       smem + (%tid.y << 14) + ((%tid.x << 5) & -16384)
//     i.e. regions of 16384 bytes.
//
// NOTE(review): barrier.sync 0 is executed 12 times unconditionally, so
// presumably every thread of the block has to reach this function together -
// confirm against the callers.
// NOTE(review): the include guard suggests a 4096-point FP32 inverse FFT; this
// specialization presumably implements it as four radix-8 stages with eight
// elements per thread - confirm.
// NOTE(review): the lut_sp_8_* tables are declared outside this block; they
// are presumably twiddle-factor tables stored as (re, im) float pairs.
template<> __forceinline__ __device__ void cufftdx_private_function<305, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// The whole computation lives in one asm statement; f*/r*/rd* are scratch
// registers declared inside the PTX scope and are not visible to C++.
asm volatile (R"({
.reg .f32 f<516>;
.reg .b32 r<28>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 14;
mov.u32 r3, %16;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f33, %20, %30;
add.f32 f34, %21, %32;
sub.f32 f35, %20, %30;
sub.f32 f36, %21, %32;
add.f32 f37, %25, %36;
add.f32 f38, %27, %37;
sub.f32 f39, %25, %36;
sub.f32 f40, %27, %37;
add.f32 f41, f33, f37;
add.f32 f42, f34, f38;
sub.f32 f43, f33, f37;
sub.f32 f44, f34, f38;
sub.f32 f45, f35, f40;
add.f32 f46, f36, f39;
add.f32 f47, f35, f40;
sub.f32 f48, f36, f39;
add.f32 f49, %22, %33;
add.f32 f50, %24, %35;
sub.f32 f51, %22, %33;
sub.f32 f52, %24, %35;
add.f32 f53, %28, %38;
add.f32 f54, %29, %39;
sub.f32 f55, %28, %38;
sub.f32 f56, %29, %39;
add.f32 f57, f49, f53;
add.f32 f58, f50, f54;
sub.f32 f59, f49, f53;
sub.f32 f60, f50, f54;
sub.f32 f61, f51, f56;
add.f32 f62, f52, f55;
add.f32 f63, f51, f56;
sub.f32 f64, f52, f55;
mul.f32 f65, f61, 0f3F3504F3;
mul.f32 f66, f62, 0f3F3504F3;
sub.f32 f67, f65, f66;
add.f32 f68, f65, f66;
mul.f32 f69, f63, 0fBF3504F3;
mul.f32 f70, f64, 0f3F3504F3;
sub.f32 f71, f69, f70;
mul.f32 f72, f64, 0fBF3504F3;
fma.rn.f32 f73, f63, 0f3F3504F3, f72;
add.f32 f74, f41, f57;
add.f32 f75, f42, f58;
sub.f32 f76, f41, f57;
sub.f32 f77, f42, f58;
add.f32 f78, f45, f67;
add.f32 f79, f46, f68;
sub.f32 f80, f45, f67;
sub.f32 f81, f46, f68;
sub.f32 f82, f43, f60;
add.f32 f83, f44, f59;
add.f32 f84, f43, f60;
sub.f32 f85, f44, f59;
add.f32 f86, f47, f71;
add.f32 f87, f48, f73;
sub.f32 f88, f47, f71;
sub.f32 f89, f48, f73;
and.b32 r6, r5, 511;
shl.b32 r7, r5, 3;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 4088;
mov.u64 rd4, %17;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f90, f91}, [rd5];
mul.f32 f94, f79, f91;
fma.rn.f32 f95, f90, f78, f94;
mul.f32 f96, f78, f91;
mul.f32 f97, f90, f79;
sub.f32 f98, f97, f96;
mul.f32 f99, f90, f90;
mul.f32 f100, f91, f91;
sub.f32 f101, f99, f100;
mul.f32 f102, f91, f90;
fma.rn.f32 f103, f91, f90, f102;
mul.f32 f104, f83, f103;
fma.rn.f32 f105, f101, f82, f104;
mul.f32 f106, f82, f103;
mul.f32 f107, f101, f83;
sub.f32 f108, f107, f106;
mul.f32 f109, f90, f101;
mul.f32 f110, f91, f103;
sub.f32 f111, f109, f110;
mul.f32 f112, f90, f103;
fma.rn.f32 f113, f91, f101, f112;
mul.f32 f114, f87, f113;
fma.rn.f32 f115, f111, f86, f114;
mul.f32 f116, f86, f113;
mul.f32 f117, f111, f87;
sub.f32 f118, f117, f116;
mul.f32 f119, f90, f111;
mul.f32 f120, f91, f113;
sub.f32 f121, f119, f120;
mul.f32 f122, f90, f113;
fma.rn.f32 f123, f91, f111, f122;
mul.f32 f124, f77, f123;
fma.rn.f32 f125, f121, f76, f124;
mul.f32 f126, f76, f123;
mul.f32 f127, f121, f77;
sub.f32 f128, f127, f126;
mul.f32 f129, f90, f121;
mul.f32 f130, f91, f123;
sub.f32 f131, f129, f130;
mul.f32 f132, f90, f123;
fma.rn.f32 f133, f91, f121, f132;
mul.f32 f134, f81, f133;
fma.rn.f32 f135, f131, f80, f134;
mul.f32 f136, f80, f133;
mul.f32 f137, f131, f81;
sub.f32 f138, f137, f136;
mul.f32 f139, f90, f131;
mul.f32 f140, f91, f133;
sub.f32 f141, f139, f140;
mul.f32 f142, f90, f133;
fma.rn.f32 f143, f91, f131, f142;
mul.f32 f144, f85, f143;
fma.rn.f32 f145, f141, f84, f144;
mul.f32 f146, f84, f143;
mul.f32 f147, f141, f85;
sub.f32 f148, f147, f146;
mul.f32 f149, f90, f141;
mul.f32 f150, f91, f143;
sub.f32 f151, f149, f150;
mul.f32 f152, f90, f143;
fma.rn.f32 f153, f91, f141, f152;
mul.f32 f154, f89, f153;
fma.rn.f32 f155, f151, f88, f154;
mul.f32 f156, f88, f153;
mul.f32 f157, f151, f89;
sub.f32 f158, f157, f156;
shl.b32 r8, r5, 5;
and.b32 r9, r8, -16384;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 16352;
add.s32 r12, r10, r11;
st.shared.v4.f32 [r12], {f74, f95, f105, f115};
st.shared.v4.f32 [r12+16], {f125, f135, f145, f155};
barrier.sync 0;
mad.lo.s32 r13, r6, -28, r12;
ld.shared.f32 f159, [r13];
ld.shared.f32 f160, [r13+2048];
ld.shared.f32 f161, [r13+4096];
ld.shared.f32 f162, [r13+6144];
ld.shared.f32 f163, [r13+8192];
ld.shared.f32 f164, [r13+10240];
ld.shared.f32 f165, [r13+12288];
ld.shared.f32 f166, [r13+14336];
barrier.sync 0;
st.shared.v4.f32 [r12], {f75, f98, f108, f118};
st.shared.v4.f32 [r12+16], {f128, f138, f148, f158};
barrier.sync 0;
ld.shared.f32 f167, [r13];
ld.shared.f32 f168, [r13+2048];
ld.shared.f32 f169, [r13+4096];
ld.shared.f32 f170, [r13+6144];
ld.shared.f32 f171, [r13+8192];
ld.shared.f32 f172, [r13+10240];
ld.shared.f32 f173, [r13+12288];
ld.shared.f32 f174, [r13+14336];
add.f32 f175, f159, f163;
add.f32 f176, f167, f171;
sub.f32 f177, f159, f163;
sub.f32 f178, f167, f171;
add.f32 f179, f161, f165;
add.f32 f180, f169, f173;
sub.f32 f181, f161, f165;
sub.f32 f182, f169, f173;
add.f32 f183, f175, f179;
add.f32 f184, f176, f180;
sub.f32 f185, f175, f179;
sub.f32 f186, f176, f180;
sub.f32 f187, f177, f182;
add.f32 f188, f178, f181;
add.f32 f189, f177, f182;
sub.f32 f190, f178, f181;
add.f32 f191, f160, f164;
add.f32 f192, f168, f172;
sub.f32 f193, f160, f164;
sub.f32 f194, f168, f172;
add.f32 f195, f162, f166;
add.f32 f196, f170, f174;
sub.f32 f197, f162, f166;
sub.f32 f198, f170, f174;
add.f32 f199, f191, f195;
add.f32 f200, f192, f196;
sub.f32 f201, f191, f195;
sub.f32 f202, f192, f196;
sub.f32 f203, f193, f198;
add.f32 f204, f194, f197;
add.f32 f205, f193, f198;
sub.f32 f206, f194, f197;
mul.f32 f207, f203, 0f3F3504F3;
mul.f32 f208, f204, 0f3F3504F3;
sub.f32 f209, f207, f208;
add.f32 f210, f207, f208;
mul.f32 f211, f205, 0fBF3504F3;
mul.f32 f212, f206, 0f3F3504F3;
sub.f32 f213, f211, f212;
mul.f32 f214, f206, 0fBF3504F3;
fma.rn.f32 f215, f205, 0f3F3504F3, f214;
add.f32 f216, f183, f199;
add.f32 f217, f184, f200;
sub.f32 f218, f183, f199;
sub.f32 f219, f184, f200;
add.f32 f220, f187, f209;
add.f32 f221, f188, f210;
sub.f32 f222, f187, f209;
sub.f32 f223, f188, f210;
sub.f32 f224, f185, f202;
add.f32 f225, f186, f201;
add.f32 f226, f185, f202;
sub.f32 f227, f186, f201;
add.f32 f228, f189, f213;
add.f32 f229, f190, f215;
sub.f32 f230, f189, f213;
sub.f32 f231, f190, f215;
and.b32 r14, r5, 504;
cvt.u64.u32 rd6, r14;
mov.u64 rd7, %18;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f232, f233}, [rd8];
mul.f32 f236, f221, f233;
fma.rn.f32 f237, f232, f220, f236;
mul.f32 f238, f220, f233;
mul.f32 f239, f232, f221;
sub.f32 f240, f239, f238;
mul.f32 f241, f232, f232;
mul.f32 f242, f233, f233;
sub.f32 f243, f241, f242;
mul.f32 f244, f233, f232;
fma.rn.f32 f245, f233, f232, f244;
mul.f32 f246, f225, f245;
fma.rn.f32 f247, f243, f224, f246;
mul.f32 f248, f224, f245;
mul.f32 f249, f243, f225;
sub.f32 f250, f249, f248;
mul.f32 f251, f232, f243;
mul.f32 f252, f233, f245;
sub.f32 f253, f251, f252;
mul.f32 f254, f232, f245;
fma.rn.f32 f255, f233, f243, f254;
mul.f32 f256, f229, f255;
fma.rn.f32 f257, f253, f228, f256;
mul.f32 f258, f228, f255;
mul.f32 f259, f253, f229;
sub.f32 f260, f259, f258;
mul.f32 f261, f232, f253;
mul.f32 f262, f233, f255;
sub.f32 f263, f261, f262;
mul.f32 f264, f232, f255;
fma.rn.f32 f265, f233, f253, f264;
mul.f32 f266, f219, f265;
fma.rn.f32 f267, f263, f218, f266;
mul.f32 f268, f218, f265;
mul.f32 f269, f263, f219;
sub.f32 f270, f269, f268;
mul.f32 f271, f232, f263;
mul.f32 f272, f233, f265;
sub.f32 f273, f271, f272;
mul.f32 f274, f232, f265;
fma.rn.f32 f275, f233, f263, f274;
mul.f32 f276, f223, f275;
fma.rn.f32 f277, f273, f222, f276;
mul.f32 f278, f222, f275;
mul.f32 f279, f273, f223;
sub.f32 f280, f279, f278;
mul.f32 f281, f232, f273;
mul.f32 f282, f233, f275;
sub.f32 f283, f281, f282;
mul.f32 f284, f232, f275;
fma.rn.f32 f285, f233, f273, f284;
mul.f32 f286, f227, f285;
fma.rn.f32 f287, f283, f226, f286;
mul.f32 f288, f226, f285;
mul.f32 f289, f283, f227;
sub.f32 f290, f289, f288;
mul.f32 f291, f232, f283;
mul.f32 f292, f233, f285;
sub.f32 f293, f291, f292;
mul.f32 f294, f232, f285;
fma.rn.f32 f295, f233, f283, f294;
mul.f32 f296, f231, f295;
fma.rn.f32 f297, f293, f230, f296;
mul.f32 f298, f230, f295;
mul.f32 f299, f293, f231;
sub.f32 f300, f299, f298;
shl.b32 r15, r5, 2;
and.b32 r16, r15, 28;
add.s32 r17, r10, r16;
barrier.sync 0;
and.b32 r18, r8, 16128;
add.s32 r19, r17, r18;
st.shared.f32 [r19], f216;
st.shared.f32 [r19+32], f237;
st.shared.f32 [r19+64], f247;
st.shared.f32 [r19+96], f257;
st.shared.f32 [r19+128], f267;
st.shared.f32 [r19+160], f277;
st.shared.f32 [r19+192], f287;
st.shared.f32 [r19+224], f297;
barrier.sync 0;
mad.lo.s32 r20, r14, -28, r19;
ld.shared.f32 f301, [r20];
ld.shared.f32 f302, [r20+2048];
ld.shared.f32 f303, [r20+4096];
ld.shared.f32 f304, [r20+6144];
ld.shared.f32 f305, [r20+8192];
ld.shared.f32 f306, [r20+10240];
ld.shared.f32 f307, [r20+12288];
ld.shared.f32 f308, [r20+14336];
barrier.sync 0;
st.shared.f32 [r19], f217;
st.shared.f32 [r19+32], f240;
st.shared.f32 [r19+64], f250;
st.shared.f32 [r19+96], f260;
st.shared.f32 [r19+128], f270;
st.shared.f32 [r19+160], f280;
st.shared.f32 [r19+192], f290;
st.shared.f32 [r19+224], f300;
barrier.sync 0;
ld.shared.f32 f309, [r20];
ld.shared.f32 f310, [r20+2048];
ld.shared.f32 f311, [r20+4096];
ld.shared.f32 f312, [r20+6144];
ld.shared.f32 f313, [r20+8192];
ld.shared.f32 f314, [r20+10240];
ld.shared.f32 f315, [r20+12288];
ld.shared.f32 f316, [r20+14336];
add.f32 f317, f301, f305;
add.f32 f318, f309, f313;
sub.f32 f319, f301, f305;
sub.f32 f320, f309, f313;
add.f32 f321, f303, f307;
add.f32 f322, f311, f315;
sub.f32 f323, f303, f307;
sub.f32 f324, f311, f315;
add.f32 f325, f317, f321;
add.f32 f326, f318, f322;
sub.f32 f327, f317, f321;
sub.f32 f328, f318, f322;
sub.f32 f329, f319, f324;
add.f32 f330, f320, f323;
add.f32 f331, f319, f324;
sub.f32 f332, f320, f323;
add.f32 f333, f302, f306;
add.f32 f334, f310, f314;
sub.f32 f335, f302, f306;
sub.f32 f336, f310, f314;
add.f32 f337, f304, f308;
add.f32 f338, f312, f316;
sub.f32 f339, f304, f308;
sub.f32 f340, f312, f316;
add.f32 f341, f333, f337;
add.f32 f342, f334, f338;
sub.f32 f343, f333, f337;
sub.f32 f344, f334, f338;
sub.f32 f345, f335, f340;
add.f32 f346, f336, f339;
add.f32 f347, f335, f340;
sub.f32 f348, f336, f339;
mul.f32 f349, f345, 0f3F3504F3;
mul.f32 f350, f346, 0f3F3504F3;
sub.f32 f351, f349, f350;
add.f32 f352, f349, f350;
mul.f32 f353, f347, 0fBF3504F3;
mul.f32 f354, f348, 0f3F3504F3;
sub.f32 f355, f353, f354;
mul.f32 f356, f348, 0fBF3504F3;
fma.rn.f32 f357, f347, 0f3F3504F3, f356;
add.f32 f358, f325, f341;
add.f32 f359, f326, f342;
sub.f32 f360, f325, f341;
sub.f32 f361, f326, f342;
add.f32 f362, f329, f351;
add.f32 f363, f330, f352;
sub.f32 f364, f329, f351;
sub.f32 f365, f330, f352;
sub.f32 f366, f327, f344;
add.f32 f367, f328, f343;
add.f32 f368, f327, f344;
sub.f32 f369, f328, f343;
add.f32 f370, f331, f355;
add.f32 f371, f332, f357;
sub.f32 f372, f331, f355;
sub.f32 f373, f332, f357;
and.b32 r21, r5, 448;
bfe.u32 r22, r5, 6, 3;
mul.wide.u32 rd9, r22, 8;
mov.u64 rd10, %19;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f374, f375}, [rd11];
mul.f32 f378, f363, f375;
fma.rn.f32 f379, f374, f362, f378;
mul.f32 f380, f362, f375;
mul.f32 f381, f374, f363;
sub.f32 f382, f381, f380;
mul.f32 f383, f374, f374;
mul.f32 f384, f375, f375;
sub.f32 f385, f383, f384;
mul.f32 f386, f375, f374;
fma.rn.f32 f387, f375, f374, f386;
mul.f32 f388, f367, f387;
fma.rn.f32 f389, f385, f366, f388;
mul.f32 f390, f366, f387;
mul.f32 f391, f385, f367;
sub.f32 f392, f391, f390;
mul.f32 f393, f374, f385;
mul.f32 f394, f375, f387;
sub.f32 f395, f393, f394;
mul.f32 f396, f374, f387;
fma.rn.f32 f397, f375, f385, f396;
mul.f32 f398, f371, f397;
fma.rn.f32 f399, f395, f370, f398;
mul.f32 f400, f370, f397;
mul.f32 f401, f395, f371;
sub.f32 f402, f401, f400;
mul.f32 f403, f374, f395;
mul.f32 f404, f375, f397;
sub.f32 f405, f403, f404;
mul.f32 f406, f374, f397;
fma.rn.f32 f407, f375, f395, f406;
mul.f32 f408, f361, f407;
fma.rn.f32 f409, f405, f360, f408;
mul.f32 f410, f360, f407;
mul.f32 f411, f405, f361;
sub.f32 f412, f411, f410;
mul.f32 f413, f374, f405;
mul.f32 f414, f375, f407;
sub.f32 f415, f413, f414;
mul.f32 f416, f374, f407;
fma.rn.f32 f417, f375, f405, f416;
mul.f32 f418, f365, f417;
fma.rn.f32 f419, f415, f364, f418;
mul.f32 f420, f364, f417;
mul.f32 f421, f415, f365;
sub.f32 f422, f421, f420;
mul.f32 f423, f374, f415;
mul.f32 f424, f375, f417;
sub.f32 f425, f423, f424;
mul.f32 f426, f374, f417;
fma.rn.f32 f427, f375, f415, f426;
mul.f32 f428, f369, f427;
fma.rn.f32 f429, f425, f368, f428;
mul.f32 f430, f368, f427;
mul.f32 f431, f425, f369;
sub.f32 f432, f431, f430;
mul.f32 f433, f374, f425;
mul.f32 f434, f375, f427;
sub.f32 f435, f433, f434;
mul.f32 f436, f374, f427;
fma.rn.f32 f437, f375, f425, f436;
mul.f32 f438, f373, f437;
fma.rn.f32 f439, f435, f372, f438;
mul.f32 f440, f372, f437;
mul.f32 f441, f435, f373;
sub.f32 f442, f441, f440;
and.b32 r23, r15, 252;
add.s32 r24, r10, r23;
barrier.sync 0;
and.b32 r25, r8, 14336;
add.s32 r26, r24, r25;
st.shared.f32 [r26], f358;
st.shared.f32 [r26+256], f379;
st.shared.f32 [r26+512], f389;
st.shared.f32 [r26+768], f399;
st.shared.f32 [r26+1024], f409;
st.shared.f32 [r26+1280], f419;
st.shared.f32 [r26+1536], f429;
st.shared.f32 [r26+1792], f439;
barrier.sync 0;
mad.lo.s32 r27, r21, -28, r26;
ld.shared.f32 f443, [r27];
ld.shared.f32 f444, [r27+2048];
ld.shared.f32 f445, [r27+4096];
ld.shared.f32 f446, [r27+6144];
ld.shared.f32 f447, [r27+8192];
ld.shared.f32 f448, [r27+10240];
ld.shared.f32 f449, [r27+12288];
ld.shared.f32 f450, [r27+14336];
barrier.sync 0;
st.shared.f32 [r26], f359;
st.shared.f32 [r26+256], f382;
st.shared.f32 [r26+512], f392;
st.shared.f32 [r26+768], f402;
st.shared.f32 [r26+1024], f412;
st.shared.f32 [r26+1280], f422;
st.shared.f32 [r26+1536], f432;
st.shared.f32 [r26+1792], f442;
barrier.sync 0;
ld.shared.f32 f451, [r27];
ld.shared.f32 f452, [r27+2048];
ld.shared.f32 f453, [r27+4096];
ld.shared.f32 f454, [r27+6144];
ld.shared.f32 f455, [r27+8192];
ld.shared.f32 f456, [r27+10240];
ld.shared.f32 f457, [r27+12288];
ld.shared.f32 f458, [r27+14336];
add.f32 f459, f443, f447;
add.f32 f460, f451, f455;
sub.f32 f461, f443, f447;
sub.f32 f462, f451, f455;
add.f32 f463, f445, f449;
add.f32 f464, f453, f457;
sub.f32 f465, f445, f449;
sub.f32 f466, f453, f457;
add.f32 f467, f459, f463;
add.f32 f468, f460, f464;
sub.f32 f469, f459, f463;
sub.f32 f470, f460, f464;
sub.f32 f471, f461, f466;
add.f32 f472, f462, f465;
add.f32 f473, f461, f466;
sub.f32 f474, f462, f465;
add.f32 f475, f444, f448;
add.f32 f476, f452, f456;
sub.f32 f477, f444, f448;
sub.f32 f478, f452, f456;
add.f32 f479, f446, f450;
add.f32 f480, f454, f458;
sub.f32 f481, f446, f450;
sub.f32 f482, f454, f458;
add.f32 f483, f475, f479;
add.f32 f484, f476, f480;
sub.f32 f485, f475, f479;
sub.f32 f486, f476, f480;
sub.f32 f487, f477, f482;
add.f32 f488, f478, f481;
add.f32 f489, f477, f482;
sub.f32 f490, f478, f481;
mul.f32 f491, f487, 0f3F3504F3;
mul.f32 f492, f488, 0f3F3504F3;
sub.f32 f493, f491, f492;
add.f32 f494, f491, f492;
mul.f32 f495, f489, 0fBF3504F3;
mul.f32 f496, f490, 0f3F3504F3;
sub.f32 f497, f495, f496;
mul.f32 f498, f490, 0fBF3504F3;
fma.rn.f32 f499, f489, 0f3F3504F3, f498;
add.f32 %0, f467, f483;
add.f32 %1, f468, f484;
add.f32 %3, f472, f494;
add.f32 %2, f471, f493;
add.f32 %5, f470, f485;
sub.f32 %4, f469, f486;
add.f32 %7, f474, f499;
add.f32 %6, f473, f497;
sub.f32 %8, f467, f483;
sub.f32 %9, f468, f484;
sub.f32 %11, f472, f494;
sub.f32 %10, f471, f493;
sub.f32 %13, f470, f485;
add.f32 %12, f469, f486;
sub.f32 %15, f474, f499;
sub.f32 %14, f473, f497;
})"
     // Operand map for the PTX above:
     //   %0  - %15 : outputs, rmem[0..7].x/.y in order (written only by the last 16 instructions)
     //   %16       : smem (32-bit shared-memory base)
     //   %17 - %19 : lut_sp_8_4096, lut_sp_8_512, lut_sp_8_64 (64-bit global addresses)
     //   %20 - %39 : inputs, rmem[0..7].x/.y; the .y of rmem[1], [2], [4] and [5] is
     //               listed twice, so those indices are not a simple 2*i pattern.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_4096), "l"(lut_sp_8_512), "l"(lut_sp_8_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y));
};




template<> __forceinline__ __device__ void cufftdx_private_function<306, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<1224>;
.reg .b32 r<36>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 15;
mov.u32 r3, %32;
add.s32 r4, r3, r2;
add.f32 f65, %35, %51;
sub.f32 f67, %35, %51;
add.f32 f1216, %36, %67;
sub.f32 f68, %36, %67;
add.f32 f69, %43, %59;
sub.f32 f71, %43, %59;
add.f32 f1214, %68, %60;
sub.f32 f72, %68, %60;
add.f32 f73, f65, f69;
sub.f32 f75, f65, f69;
add.f32 f1213, f1216, f1214;
sub.f32 f76, f1216, f1214;
sub.f32 f77, f67, f72;
add.f32 f79, f67, f72;
add.f32 f1212, f68, f71;
sub.f32 f80, f68, f71;
add.f32 f81, %39, %55;
sub.f32 f83, %39, %55;
add.f32 f1209, %70, %69;
sub.f32 f84, %70, %69;
add.f32 f85, %47, %63;
sub.f32 f87, %47, %63;
add.f32 f1207, %48, %71;
sub.f32 f88, %48, %71;
add.f32 f89, f81, f85;
sub.f32 f91, f81, f85;
add.f32 f1206, f1209, f1207;
sub.f32 f92, f1209, f1207;
sub.f32 f93, f83, f88;
add.f32 f95, f83, f88;
add.f32 f1205, f84, f87;
sub.f32 f96, f84, f87;
mul.f32 f97, f93, 0f3F3504F3;
mul.f32 f98, f1205, 0f3F3504F3;
sub.f32 f99, f97, f98;
add.f32 f100, f97, f98;
mul.f32 f1203, f95, 0fBF3504F3;
mul.f32 f1204, f96, 0f3F3504F3;
sub.f32 f103, f1203, f1204;
mul.f32 f104, f96, 0fBF3504F3;
fma.rn.f32 f105, f95, 0f3F3504F3, f104;
add.f32 f106, f73, f89;
sub.f32 f108, f73, f89;
add.f32 f1202, f1213, f1206;
sub.f32 f109, f1213, f1206;
add.f32 f110, f77, f99;
sub.f32 f112, f77, f99;
add.f32 f1201, f1212, f100;
sub.f32 f113, f1212, f100;
sub.f32 f114, f75, f92;
add.f32 f116, f75, f92;
add.f32 f1200, f76, f91;
sub.f32 f117, f76, f91;
add.f32 f118, f79, f103;
sub.f32 f120, f79, f103;
add.f32 f1199, f80, f105;
sub.f32 f121, f80, f105;
add.f32 f122, %37, %53;
sub.f32 f124, %37, %53;
add.f32 f1197, %72, %54;
sub.f32 f125, %72, %54;
add.f32 f126, %45, %61;
sub.f32 f128, %45, %61;
add.f32 f1194, %73, %74;
sub.f32 f129, %73, %74;
add.f32 f130, f122, f126;
sub.f32 f132, f122, f126;
add.f32 f1193, f1197, f1194;
sub.f32 f133, f1197, f1194;
sub.f32 f134, f124, f129;
add.f32 f136, f124, f129;
add.f32 f1192, f125, f128;
sub.f32 f137, f125, f128;
add.f32 f138, %41, %57;
sub.f32 f140, %41, %57;
add.f32 f1190, %42, %75;
sub.f32 f141, %42, %75;
add.f32 f142, %49, %65;
sub.f32 f144, %49, %65;
add.f32 f1188, %76, %66;
sub.f32 f145, %76, %66;
add.f32 f146, f138, f142;
sub.f32 f148, f138, f142;
add.f32 f1187, f1190, f1188;
sub.f32 f149, f1190, f1188;
sub.f32 f150, f140, f145;
add.f32 f152, f140, f145;
add.f32 f1186, f141, f144;
sub.f32 f153, f141, f144;
mul.f32 f154, f150, 0f3F3504F3;
mul.f32 f155, f1186, 0f3F3504F3;
sub.f32 f156, f154, f155;
add.f32 f157, f154, f155;
mul.f32 f1184, f152, 0fBF3504F3;
mul.f32 f1185, f153, 0f3F3504F3;
sub.f32 f160, f1184, f1185;
mul.f32 f161, f153, 0fBF3504F3;
fma.rn.f32 f162, f152, 0f3F3504F3, f161;
add.f32 f163, f130, f146;
sub.f32 f165, f130, f146;
add.f32 f1183, f1193, f1187;
sub.f32 f166, f1193, f1187;
add.f32 f167, f134, f156;
sub.f32 f169, f134, f156;
add.f32 f1182, f1192, f157;
sub.f32 f170, f1192, f157;
sub.f32 f171, f132, f149;
add.f32 f173, f132, f149;
add.f32 f1181, f133, f148;
sub.f32 f174, f133, f148;
add.f32 f175, f136, f160;
sub.f32 f177, f136, f160;
add.f32 f1180, f137, f162;
sub.f32 f178, f137, f162;
mul.f32 f1178, f167, 0f3F6C835E;
mul.f32 f1179, f1182, 0f3EC3EF15;
sub.f32 f181, f1178, f1179;
mul.f32 f182, f1182, 0f3F6C835E;
fma.rn.f32 f183, f167, 0f3EC3EF15, f182;
mul.f32 f184, f171, 0f3F3504F3;
mul.f32 f185, f1181, 0f3F3504F3;
sub.f32 f186, f184, f185;
add.f32 f187, f184, f185;
mul.f32 f189, f1180, 0f3F6C835E;
mul.f32 f1177, f175, 0f3EC3EF15;
sub.f32 f190, f1177, f189;
mul.f32 f191, f1180, 0f3EC3EF15;
fma.rn.f32 f192, f175, 0f3F6C835E, f191;
mul.f32 f194, f170, 0f3F6C835E;
mul.f32 f1176, f169, 0fBEC3EF15;
sub.f32 f195, f1176, f194;
mul.f32 f196, f170, 0fBEC3EF15;
fma.rn.f32 f197, f169, 0f3F6C835E, f196;
mul.f32 f1174, f173, 0fBF3504F3;
mul.f32 f1175, f174, 0f3F3504F3;
sub.f32 f200, f1174, f1175;
mul.f32 f201, f174, 0fBF3504F3;
fma.rn.f32 f202, f173, 0f3F3504F3, f201;
mul.f32 f1172, f177, 0fBF6C835E;
mul.f32 f1173, f178, 0f3EC3EF15;
sub.f32 f205, f1172, f1173;
mul.f32 f206, f178, 0fBF6C835E;
fma.rn.f32 f207, f177, 0f3EC3EF15, f206;
add.f32 f210, f110, f181;
sub.f32 f212, f110, f181;
add.f32 f1171, f1201, f183;
sub.f32 f213, f1201, f183;
add.f32 f214, f114, f186;
sub.f32 f216, f114, f186;
add.f32 f1170, f1200, f187;
sub.f32 f217, f1200, f187;
add.f32 f218, f118, f190;
sub.f32 f220, f118, f190;
add.f32 f1169, f1199, f192;
sub.f32 f221, f1199, f192;
sub.f32 f222, f108, f166;
add.f32 f224, f108, f166;
add.f32 f1168, f109, f165;
sub.f32 f225, f109, f165;
add.f32 f226, f112, f195;
sub.f32 f228, f112, f195;
add.f32 f1167, f113, f197;
sub.f32 f229, f113, f197;
add.f32 f230, f116, f200;
sub.f32 f232, f116, f200;
add.f32 f1166, f117, f202;
sub.f32 f233, f117, f202;
add.f32 f234, f120, f205;
sub.f32 f236, f120, f205;
add.f32 f1165, f121, f207;
sub.f32 f237, f121, f207;
mov.u32 r22, %tid.x;
shl.b32 r7, r22, 7;
and.b32 r8, r7, -32768;
add.s32 r9, r4, r8;
shl.b32 r10, r22, 3;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 2040;
mov.u64 rd4, %33;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f238, f239}, [rd5];
mul.f32 f242, f1171, f239;
mul.f32 f244, f238, f1171;
mul.f32 f246, f239, f239;
mul.f32 f1164, f238, f238;
sub.f32 f247, f1164, f246;
mul.f32 f248, f239, f238;
fma.rn.f32 f249, f239, f238, f248;
mul.f32 f250, f1170, f249;
mul.f32 f252, f247, f1170;
mul.f32 f1162, f238, f247;
mul.f32 f1163, f239, f249;
sub.f32 f255, f1162, f1163;
mul.f32 f1161, f214, f249;
mul.f32 f256, f238, f249;
fma.rn.f32 f257, f239, f247, f256;
mul.f32 f258, f1169, f257;
mul.f32 f260, f255, f1169;
mul.f32 f262, f239, f257;
mul.f32 f1160, f238, f255;
sub.f32 f263, f1160, f262;
mul.f32 f1159, f218, f257;
mul.f32 f264, f238, f257;
fma.rn.f32 f265, f239, f255, f264;
mul.f32 f266, f1168, f265;
mul.f32 f268, f263, f1168;
mul.f32 f270, f239, f265;
mul.f32 f1158, f238, f263;
sub.f32 f271, f1158, f270;
mul.f32 f1157, f222, f265;
mul.f32 f272, f238, f265;
fma.rn.f32 f273, f239, f263, f272;
mul.f32 f274, f1167, f273;
mul.f32 f276, f271, f1167;
mul.f32 f1155, f238, f271;
mul.f32 f1156, f239, f273;
sub.f32 f279, f1155, f1156;
mul.f32 f1154, f226, f273;
mul.f32 f280, f238, f273;
fma.rn.f32 f281, f239, f271, f280;
mul.f32 f282, f1166, f281;
mul.f32 f284, f279, f1166;
mul.f32 f286, f239, f281;
mul.f32 f1153, f238, f279;
sub.f32 f287, f1153, f286;
mul.f32 f1152, f230, f281;
mul.f32 f288, f238, f281;
fma.rn.f32 f289, f239, f279, f288;
mul.f32 f290, f1165, f289;
mul.f32 f292, f287, f1165;
mul.f32 f294, f239, f289;
mul.f32 f1151, f238, f287;
sub.f32 f295, f1151, f294;
mul.f32 f1150, f234, f289;
mul.f32 f296, f238, f289;
fma.rn.f32 f297, f239, f287, f296;
sub.f32 f1149, f1202, f1183;
mul.f32 f298, f1149, f297;
mul.f32 f300, f295, f1149;
mul.f32 f1147, f238, f295;
mul.f32 f1148, f239, f297;
sub.f32 f303, f1147, f1148;
sub.f32 f1146, f106, f163;
mul.f32 f1145, f1146, f297;
mul.f32 f304, f238, f297;
fma.rn.f32 f305, f239, f295, f304;
mul.f32 f306, f213, f305;
mul.f32 f308, f303, f213;
mul.f32 f310, f239, f305;
mul.f32 f1144, f238, f303;
sub.f32 f311, f1144, f310;
mul.f32 f1143, f212, f305;
mul.f32 f312, f238, f305;
fma.rn.f32 f313, f239, f303, f312;
mul.f32 f314, f217, f313;
mul.f32 f316, f311, f217;
mul.f32 f1141, f238, f311;
mul.f32 f1142, f239, f313;
sub.f32 f319, f1141, f1142;
mul.f32 f1140, f216, f313;
mul.f32 f320, f238, f313;
fma.rn.f32 f321, f239, f311, f320;
mul.f32 f322, f221, f321;
mul.f32 f324, f319, f221;
mul.f32 f326, f239, f321;
mul.f32 f1139, f238, f319;
sub.f32 f327, f1139, f326;
mul.f32 f1138, f220, f321;
mul.f32 f328, f238, f321;
fma.rn.f32 f329, f239, f319, f328;
mul.f32 f330, f225, f329;
mul.f32 f332, f327, f225;
mul.f32 f334, f239, f329;
mul.f32 f1137, f238, f327;
sub.f32 f335, f1137, f334;
mul.f32 f1136, f224, f329;
mul.f32 f336, f238, f329;
fma.rn.f32 f337, f239, f327, f336;
mul.f32 f338, f229, f337;
mul.f32 f340, f335, f229;
mul.f32 f1134, f238, f335;
mul.f32 f1135, f239, f337;
sub.f32 f343, f1134, f1135;
mul.f32 f1133, f228, f337;
mul.f32 f344, f238, f337;
fma.rn.f32 f345, f239, f335, f344;
mul.f32 f346, f233, f345;
mul.f32 f348, f343, f233;
mul.f32 f350, f239, f345;
mul.f32 f1132, f238, f343;
sub.f32 f351, f1132, f350;
mul.f32 f1131, f232, f345;
mul.f32 f352, f238, f345;
mul.f32 f1130, f210, f239;
fma.rn.f32 f353, f239, f343, f352;
mul.f32 f354, f237, f353;
mul.f32 f355, f236, f353;
mul.f32 f356, f351, f237;
barrier.sync 0;
and.b32 r11, r7, 32640;
add.s32 r12, r9, r11;
add.f32 f357, f1202, f1183;
sub.f32 f1218, f106, f163;
add.f32 f358, f106, f163;
mov.u32 r32, %tid.x;
shl.b32 r27, r32, 7;
shl.b32 r26, r32, 3;
fma.rn.f32 f359, f238, f210, f242;
sub.f32 f360, f244, f1130;
st.shared.v4.f32 [r12], {f358, f357, f359, f360};
fma.rn.f32 f361, f247, f214, f250;
sub.f32 f362, f252, f1161;
fma.rn.f32 f363, f255, f218, f258;
sub.f32 f364, f260, f1159;
st.shared.v4.f32 [r12+16], {f361, f362, f363, f364};
sub.f32 f365, f268, f1157;
fma.rn.f32 f366, f263, f222, f266;
fma.rn.f32 f367, f271, f226, f274;
sub.f32 f368, f276, f1154;
st.shared.v4.f32 [r12+32], {f366, f365, f367, f368};
fma.rn.f32 f369, f279, f230, f282;
sub.f32 f370, f284, f1152;
fma.rn.f32 f371, f287, f234, f290;
sub.f32 f372, f292, f1150;
st.shared.v4.f32 [r12+48], {f369, f370, f371, f372};
fma.rn.f32 f373, f295, f1218, f298;
sub.f32 f374, f300, f1145;
fma.rn.f32 f375, f303, f212, f306;
sub.f32 f376, f308, f1143;
st.shared.v4.f32 [r12+64], {f373, f374, f375, f376};
fma.rn.f32 f377, f311, f216, f314;
sub.f32 f378, f316, f1140;
fma.rn.f32 f379, f319, f220, f322;
sub.f32 f380, f324, f1138;
st.shared.v4.f32 [r12+80], {f377, f378, f379, f380};
fma.rn.f32 f381, f327, f224, f330;
sub.f32 f382, f332, f1136;
fma.rn.f32 f383, f335, f228, f338;
sub.f32 f384, f340, f1133;
st.shared.v4.f32 [r12+96], {f381, f382, f383, f384};
fma.rn.f32 f385, f343, f232, f346;
sub.f32 f386, f348, f1131;
fma.rn.f32 f387, f351, f236, f354;
sub.f32 f388, f356, f355;
st.shared.v4.f32 [r12+112], {f385, f386, f387, f388};
barrier.sync 0;
and.b32 r21, r32, 255;
mad.lo.s32 r13, r21, -120, r12;
ld.shared.v2.f32 {f389, f390}, [r13];
ld.shared.v2.f32 {f393, f394}, [r13+2048];
ld.shared.v2.f32 {f397, f398}, [r13+4096];
ld.shared.v2.f32 {f401, f402}, [r13+6144];
ld.shared.v2.f32 {f405, f406}, [r13+8192];
ld.shared.v2.f32 {f409, f410}, [r13+10240];
ld.shared.v2.f32 {f413, f414}, [r13+12288];
ld.shared.v2.f32 {f417, f418}, [r13+14336];
ld.shared.v2.f32 {f421, f422}, [r13+16384];
ld.shared.v2.f32 {f425, f426}, [r13+18432];
ld.shared.v2.f32 {f429, f430}, [r13+20480];
ld.shared.v2.f32 {f433, f434}, [r13+22528];
ld.shared.v2.f32 {f437, f438}, [r13+24576];
ld.shared.v2.f32 {f441, f442}, [r13+26624];
ld.shared.v2.f32 {f445, f446}, [r13+28672];
ld.shared.v2.f32 {f449, f450}, [r13+30720];
add.f32 f453, f389, f421;
sub.f32 f455, f389, f421;
add.f32 f1129, f390, f422;
sub.f32 f456, f390, f422;
add.f32 f457, f405, f437;
sub.f32 f459, f405, f437;
add.f32 f1128, f406, f438;
sub.f32 f460, f406, f438;
add.f32 f461, f453, f457;
sub.f32 f463, f453, f457;
add.f32 f1127, f1129, f1128;
sub.f32 f464, f1129, f1128;
sub.f32 f465, f455, f460;
add.f32 f467, f455, f460;
add.f32 f1126, f456, f459;
sub.f32 f468, f456, f459;
add.f32 f469, f397, f429;
sub.f32 f471, f397, f429;
add.f32 f1125, f398, f430;
sub.f32 f472, f398, f430;
add.f32 f473, f413, f445;
sub.f32 f475, f413, f445;
add.f32 f1124, f414, f446;
sub.f32 f476, f414, f446;
add.f32 f477, f469, f473;
sub.f32 f479, f469, f473;
add.f32 f1123, f1125, f1124;
sub.f32 f480, f1125, f1124;
sub.f32 f481, f471, f476;
add.f32 f483, f471, f476;
add.f32 f1122, f472, f475;
sub.f32 f484, f472, f475;
mul.f32 f485, f481, 0f3F3504F3;
mul.f32 f486, f1122, 0f3F3504F3;
sub.f32 f487, f485, f486;
add.f32 f488, f485, f486;
mul.f32 f490, f484, 0f3F3504F3;
mul.f32 f1121, f483, 0fBF3504F3;
sub.f32 f491, f1121, f490;
mul.f32 f492, f484, 0fBF3504F3;
fma.rn.f32 f493, f483, 0f3F3504F3, f492;
add.f32 f494, f461, f477;
sub.f32 f496, f461, f477;
add.f32 f1120, f1127, f1123;
sub.f32 f497, f1127, f1123;
add.f32 f498, f465, f487;
sub.f32 f500, f465, f487;
add.f32 f1119, f1126, f488;
sub.f32 f501, f1126, f488;
sub.f32 f502, f463, f480;
add.f32 f504, f463, f480;
add.f32 f1118, f464, f479;
sub.f32 f505, f464, f479;
add.f32 f506, f467, f491;
sub.f32 f508, f467, f491;
add.f32 f1117, f468, f493;
sub.f32 f509, f468, f493;
add.f32 f510, f393, f425;
sub.f32 f512, f393, f425;
add.f32 f1116, f394, f426;
sub.f32 f513, f394, f426;
add.f32 f514, f409, f441;
sub.f32 f516, f409, f441;
add.f32 f1115, f410, f442;
sub.f32 f517, f410, f442;
add.f32 f518, f510, f514;
sub.f32 f520, f510, f514;
add.f32 f1114, f1116, f1115;
sub.f32 f521, f1116, f1115;
sub.f32 f522, f512, f517;
add.f32 f524, f512, f517;
add.f32 f1113, f513, f516;
sub.f32 f525, f513, f516;
add.f32 f526, f401, f433;
sub.f32 f528, f401, f433;
add.f32 f1112, f402, f434;
sub.f32 f529, f402, f434;
add.f32 f530, f417, f449;
sub.f32 f532, f417, f449;
add.f32 f1111, f418, f450;
sub.f32 f533, f418, f450;
add.f32 f534, f526, f530;
sub.f32 f536, f526, f530;
add.f32 f1110, f1112, f1111;
sub.f32 f537, f1112, f1111;
sub.f32 f538, f528, f533;
add.f32 f540, f528, f533;
add.f32 f1109, f529, f532;
sub.f32 f541, f529, f532;
mul.f32 f542, f538, 0f3F3504F3;
mul.f32 f543, f1109, 0f3F3504F3;
sub.f32 f544, f542, f543;
add.f32 f545, f542, f543;
mul.f32 f547, f541, 0f3F3504F3;
mul.f32 f1108, f540, 0fBF3504F3;
sub.f32 f548, f1108, f547;
mul.f32 f549, f541, 0fBF3504F3;
fma.rn.f32 f550, f540, 0f3F3504F3, f549;
add.f32 f551, f518, f534;
sub.f32 f553, f518, f534;
add.f32 f1107, f1114, f1110;
sub.f32 f554, f1114, f1110;
add.f32 f555, f522, f544;
sub.f32 f557, f522, f544;
add.f32 f1106, f1113, f545;
sub.f32 f558, f1113, f545;
sub.f32 f559, f520, f537;
add.f32 f561, f520, f537;
add.f32 f1105, f521, f536;
sub.f32 f562, f521, f536;
add.f32 f563, f524, f548;
sub.f32 f565, f524, f548;
add.f32 f1104, f525, f550;
sub.f32 f566, f525, f550;
mul.f32 f568, f1106, 0f3EC3EF15;
mul.f32 f1103, f555, 0f3F6C835E;
sub.f32 f569, f1103, f568;
mul.f32 f570, f1106, 0f3F6C835E;
fma.rn.f32 f571, f555, 0f3EC3EF15, f570;
mul.f32 f572, f559, 0f3F3504F3;
mul.f32 f573, f1105, 0f3F3504F3;
sub.f32 f574, f572, f573;
add.f32 f575, f572, f573;
mul.f32 f577, f1104, 0f3F6C835E;
mul.f32 f1102, f563, 0f3EC3EF15;
sub.f32 f578, f1102, f577;
mul.f32 f579, f1104, 0f3EC3EF15;
fma.rn.f32 f580, f563, 0f3F6C835E, f579;
mul.f32 f582, f558, 0f3F6C835E;
mul.f32 f1101, f557, 0fBEC3EF15;
sub.f32 f583, f1101, f582;
mul.f32 f584, f558, 0fBEC3EF15;
fma.rn.f32 f585, f557, 0f3F6C835E, f584;
mul.f32 f587, f562, 0f3F3504F3;
mul.f32 f1100, f561, 0fBF3504F3;
sub.f32 f588, f1100, f587;
mul.f32 f589, f562, 0fBF3504F3;
fma.rn.f32 f590, f561, 0f3F3504F3, f589;
mul.f32 f592, f566, 0f3EC3EF15;
mul.f32 f1099, f565, 0fBF6C835E;
sub.f32 f593, f1099, f592;
mul.f32 f594, f566, 0fBF6C835E;
fma.rn.f32 f595, f565, 0f3EC3EF15, f594;
add.f32 f598, f498, f569;
sub.f32 f600, f498, f569;
add.f32 f1098, f1119, f571;
sub.f32 f601, f1119, f571;
add.f32 f602, f502, f574;
sub.f32 f604, f502, f574;
add.f32 f1097, f1118, f575;
sub.f32 f605, f1118, f575;
add.f32 f606, f506, f578;
sub.f32 f608, f506, f578;
add.f32 f1096, f1117, f580;
sub.f32 f609, f1117, f580;
sub.f32 f610, f496, f554;
add.f32 f612, f496, f554;
add.f32 f1095, f497, f553;
sub.f32 f613, f497, f553;
add.f32 f614, f500, f583;
sub.f32 f616, f500, f583;
add.f32 f1094, f501, f585;
sub.f32 f617, f501, f585;
add.f32 f618, f504, f588;
sub.f32 f620, f504, f588;
add.f32 f1093, f505, f590;
sub.f32 f621, f505, f590;
add.f32 f622, f508, f593;
sub.f32 f624, f508, f593;
add.f32 f1092, f509, f595;
sub.f32 f625, f509, f595;
and.b32 r14, r32, 240;
bfe.u32 r15, r32, 4, 4;
mul.wide.u32 rd6, r15, 8;
mov.u64 rd7, %34;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f626, f627}, [rd8];
mul.f32 f630, f1098, f627;
mul.f32 f632, f626, f1098;
mul.f32 f634, f627, f627;
mul.f32 f1091, f626, f626;
sub.f32 f635, f1091, f634;
mul.f32 f636, f627, f626;
fma.rn.f32 f637, f627, f626, f636;
mul.f32 f638, f1097, f637;
mul.f32 f640, f635, f1097;
mul.f32 f1089, f626, f635;
mul.f32 f1090, f627, f637;
sub.f32 f643, f1089, f1090;
mul.f32 f1088, f602, f637;
mul.f32 f644, f626, f637;
fma.rn.f32 f645, f627, f635, f644;
mul.f32 f646, f1096, f645;
mul.f32 f648, f643, f1096;
mul.f32 f650, f627, f645;
mul.f32 f1087, f626, f643;
sub.f32 f651, f1087, f650;
mul.f32 f1086, f606, f645;
mul.f32 f652, f626, f645;
fma.rn.f32 f653, f627, f643, f652;
mul.f32 f654, f1095, f653;
mul.f32 f656, f651, f1095;
mul.f32 f658, f627, f653;
mul.f32 f1085, f626, f651;
sub.f32 f659, f1085, f658;
mul.f32 f1084, f610, f653;
mul.f32 f660, f626, f653;
fma.rn.f32 f661, f627, f651, f660;
mul.f32 f662, f1094, f661;
mul.f32 f664, f659, f1094;
mul.f32 f1082, f626, f659;
mul.f32 f1083, f627, f661;
sub.f32 f667, f1082, f1083;
mul.f32 f1081, f614, f661;
mul.f32 f668, f626, f661;
fma.rn.f32 f669, f627, f659, f668;
mul.f32 f670, f1093, f669;
mul.f32 f672, f667, f1093;
mul.f32 f674, f627, f669;
mul.f32 f1080, f626, f667;
sub.f32 f675, f1080, f674;
mul.f32 f1079, f618, f669;
mul.f32 f676, f626, f669;
fma.rn.f32 f677, f627, f667, f676;
mul.f32 f678, f1092, f677;
mul.f32 f680, f675, f1092;
mul.f32 f682, f627, f677;
mul.f32 f1078, f626, f675;
sub.f32 f683, f1078, f682;
mul.f32 f1077, f622, f677;
mul.f32 f684, f626, f677;
fma.rn.f32 f685, f627, f675, f684;
sub.f32 f1076, f1120, f1107;
mul.f32 f686, f1076, f685;
mul.f32 f688, f683, f1076;
mul.f32 f1074, f626, f683;
mul.f32 f1075, f627, f685;
sub.f32 f691, f1074, f1075;
sub.f32 f1073, f494, f551;
mul.f32 f1072, f1073, f685;
mul.f32 f692, f626, f685;
fma.rn.f32 f693, f627, f683, f692;
mul.f32 f694, f601, f693;
mul.f32 f696, f691, f601;
mul.f32 f698, f627, f693;
mul.f32 f1071, f626, f691;
sub.f32 f699, f1071, f698;
mul.f32 f1070, f600, f693;
mul.f32 f700, f626, f693;
fma.rn.f32 f701, f627, f691, f700;
mul.f32 f702, f605, f701;
mul.f32 f704, f699, f605;
mul.f32 f1068, f626, f699;
mul.f32 f1069, f627, f701;
sub.f32 f707, f1068, f1069;
mul.f32 f1067, f604, f701;
mul.f32 f708, f626, f701;
fma.rn.f32 f709, f627, f699, f708;
mul.f32 f710, f609, f709;
mul.f32 f712, f707, f609;
mul.f32 f714, f627, f709;
mul.f32 f1066, f626, f707;
sub.f32 f715, f1066, f714;
mul.f32 f1065, f608, f709;
mul.f32 f716, f626, f709;
fma.rn.f32 f717, f627, f707, f716;
mul.f32 f718, f613, f717;
mul.f32 f720, f715, f613;
mul.f32 f722, f627, f717;
mul.f32 f1064, f626, f715;
sub.f32 f723, f1064, f722;
mul.f32 f1063, f612, f717;
mul.f32 f724, f626, f717;
fma.rn.f32 f725, f627, f715, f724;
mul.f32 f726, f617, f725;
mul.f32 f728, f723, f617;
mul.f32 f1061, f626, f723;
mul.f32 f1062, f627, f725;
sub.f32 f731, f1061, f1062;
mul.f32 f1060, f616, f725;
mul.f32 f732, f626, f725;
fma.rn.f32 f733, f627, f723, f732;
mul.f32 f734, f621, f733;
mul.f32 f736, f731, f621;
mul.f32 f738, f627, f733;
mul.f32 f1059, f626, f731;
sub.f32 f739, f1059, f738;
mul.f32 f1058, f620, f733;
mul.f32 f740, f626, f733;
mul.f32 f1057, f598, f627;
fma.rn.f32 f741, f627, f731, f740;
mul.f32 f742, f625, f741;
mul.f32 f743, f624, f741;
mul.f32 f744, f739, f625;
and.b32 r16, r26, 120;
add.s32 r17, r9, r16;
sub.f32 f1220, f1120, f1107;
mul.f32 f1219, f683, f1220;
mov.u32 r31, %tid.x;
shl.b32 r30, r31, 7;
barrier.sync 0;
and.b32 r18, r30, 30720;
add.s32 r19, r17, r18;
sub.f32 f1222, f1120, f1107;
mul.f32 f1221, f683, f1222;
add.f32 f745, f1120, f1107;
mov.u32 r25, %tid.x;
and.b32 r24, r25, 240;
add.f32 f746, f494, f551;
st.shared.v2.f32 [r19], {f746, f745};
mov.u32 r29, %tid.x;
and.b32 r28, r29, 240;
fma.rn.f32 f747, f626, f598, f630;
sub.f32 f748, f632, f1057;
st.shared.v2.f32 [r19+128], {f747, f748};
fma.rn.f32 f749, f635, f602, f638;
sub.f32 f750, f640, f1088;
st.shared.v2.f32 [r19+256], {f749, f750};
fma.rn.f32 f751, f643, f606, f646;
sub.f32 f752, f648, f1086;
st.shared.v2.f32 [r19+384], {f751, f752};
fma.rn.f32 f753, f651, f610, f654;
sub.f32 f754, f656, f1084;
st.shared.v2.f32 [r19+512], {f753, f754};
sub.f32 f755, f664, f1081;
fma.rn.f32 f756, f659, f614, f662;
st.shared.v2.f32 [r19+640], {f756, f755};
fma.rn.f32 f757, f667, f618, f670;
sub.f32 f758, f672, f1079;
st.shared.v2.f32 [r19+768], {f757, f758};
fma.rn.f32 f759, f675, f622, f678;
sub.f32 f760, f680, f1077;
st.shared.v2.f32 [r19+896], {f759, f760};
fma.rn.f32 f761, f683, f1073, f686;
sub.f32 f762, f1221, f1072;
st.shared.v2.f32 [r19+1024], {f761, f762};
fma.rn.f32 f763, f691, f600, f694;
sub.f32 f764, f696, f1070;
st.shared.v2.f32 [r19+1152], {f763, f764};
fma.rn.f32 f765, f699, f604, f702;
sub.f32 f766, f704, f1067;
st.shared.v2.f32 [r19+1280], {f765, f766};
fma.rn.f32 f767, f707, f608, f710;
sub.f32 f768, f712, f1065;
st.shared.v2.f32 [r19+1408], {f767, f768};
fma.rn.f32 f769, f715, f612, f718;
sub.f32 f770, f720, f1063;
st.shared.v2.f32 [r19+1536], {f769, f770};
fma.rn.f32 f771, f723, f616, f726;
sub.f32 f772, f728, f1060;
st.shared.v2.f32 [r19+1664], {f771, f772};
fma.rn.f32 f773, f731, f620, f734;
sub.f32 f774, f736, f1058;
st.shared.v2.f32 [r19+1792], {f773, f774};
fma.rn.f32 f775, f739, f624, f742;
sub.f32 f776, f744, f743;
st.shared.v2.f32 [r19+1920], {f775, f776};
barrier.sync 0;
mad.lo.s32 r20, r28, -120, r19;
ld.shared.v2.f32 {f777, f778}, [r20];
ld.shared.v2.f32 {f781, f782}, [r20+2048];
ld.shared.v2.f32 {f785, f786}, [r20+4096];
ld.shared.v2.f32 {f789, f790}, [r20+6144];
ld.shared.v2.f32 {f793, f794}, [r20+8192];
ld.shared.v2.f32 {f797, f798}, [r20+10240];
ld.shared.v2.f32 {f801, f802}, [r20+12288];
ld.shared.v2.f32 {f805, f806}, [r20+14336];
ld.shared.v2.f32 {f809, f810}, [r20+16384];
ld.shared.v2.f32 {f813, f814}, [r20+18432];
ld.shared.v2.f32 {f817, f818}, [r20+20480];
ld.shared.v2.f32 {f821, f822}, [r20+22528];
ld.shared.v2.f32 {f825, f826}, [r20+24576];
ld.shared.v2.f32 {f829, f830}, [r20+26624];
ld.shared.v2.f32 {f833, f834}, [r20+28672];
ld.shared.v2.f32 {f837, f838}, [r20+30720];
add.f32 f841, f777, f809;
sub.f32 f843, f777, f809;
add.f32 f1056, f778, f810;
sub.f32 f844, f778, f810;
add.f32 f845, f793, f825;
sub.f32 f847, f793, f825;
add.f32 f1055, f794, f826;
sub.f32 f848, f794, f826;
add.f32 f849, f841, f845;
sub.f32 f851, f841, f845;
add.f32 f1054, f1056, f1055;
sub.f32 f852, f1056, f1055;
sub.f32 f853, f843, f848;
add.f32 f855, f843, f848;
add.f32 f1053, f844, f847;
sub.f32 f856, f844, f847;
add.f32 f857, f785, f817;
sub.f32 f859, f785, f817;
add.f32 f1052, f786, f818;
sub.f32 f860, f786, f818;
add.f32 f861, f801, f833;
sub.f32 f863, f801, f833;
add.f32 f1051, f802, f834;
sub.f32 f864, f802, f834;
add.f32 f865, f857, f861;
sub.f32 f867, f857, f861;
add.f32 f1050, f1052, f1051;
sub.f32 f868, f1052, f1051;
sub.f32 f869, f859, f864;
add.f32 f871, f859, f864;
add.f32 f1049, f860, f863;
sub.f32 f872, f860, f863;
mul.f32 f873, f869, 0f3F3504F3;
mul.f32 f874, f1049, 0f3F3504F3;
sub.f32 f875, f873, f874;
add.f32 f876, f873, f874;
mul.f32 f878, f872, 0f3F3504F3;
mul.f32 f1048, f871, 0fBF3504F3;
sub.f32 f879, f1048, f878;
mul.f32 f880, f872, 0fBF3504F3;
fma.rn.f32 f881, f871, 0f3F3504F3, f880;
add.f32 f882, f849, f865;
sub.f32 f884, f849, f865;
add.f32 f1047, f1054, f1050;
sub.f32 f885, f1054, f1050;
add.f32 f886, f853, f875;
sub.f32 f888, f853, f875;
add.f32 f1046, f1053, f876;
sub.f32 f889, f1053, f876;
sub.f32 f890, f851, f868;
add.f32 f892, f851, f868;
add.f32 f1045, f852, f867;
sub.f32 f893, f852, f867;
add.f32 f894, f855, f879;
sub.f32 f896, f855, f879;
add.f32 f1044, f856, f881;
sub.f32 f897, f856, f881;
add.f32 f898, f781, f813;
sub.f32 f900, f781, f813;
add.f32 f1043, f782, f814;
sub.f32 f901, f782, f814;
add.f32 f902, f797, f829;
sub.f32 f904, f797, f829;
add.f32 f1042, f798, f830;
sub.f32 f905, f798, f830;
add.f32 f906, f898, f902;
sub.f32 f908, f898, f902;
add.f32 f1041, f1043, f1042;
sub.f32 f909, f1043, f1042;
sub.f32 f910, f900, f905;
add.f32 f912, f900, f905;
add.f32 f1040, f901, f904;
sub.f32 f913, f901, f904;
add.f32 f914, f789, f821;
sub.f32 f916, f789, f821;
add.f32 f1039, f790, f822;
sub.f32 f917, f790, f822;
add.f32 f918, f805, f837;
sub.f32 f920, f805, f837;
add.f32 f1038, f806, f838;
sub.f32 f921, f806, f838;
add.f32 f922, f914, f918;
sub.f32 f924, f914, f918;
add.f32 f1037, f1039, f1038;
sub.f32 f925, f1039, f1038;
sub.f32 f926, f916, f921;
add.f32 f928, f916, f921;
add.f32 f1036, f917, f920;
sub.f32 f929, f917, f920;
mul.f32 f930, f926, 0f3F3504F3;
mul.f32 f931, f1036, 0f3F3504F3;
sub.f32 f932, f930, f931;
add.f32 f933, f930, f931;
mul.f32 f935, f929, 0f3F3504F3;
mul.f32 f1035, f928, 0fBF3504F3;
sub.f32 f936, f1035, f935;
mul.f32 f937, f929, 0fBF3504F3;
fma.rn.f32 f938, f928, 0f3F3504F3, f937;
add.f32 f939, f906, f922;
sub.f32 f941, f906, f922;
add.f32 f1034, f1041, f1037;
sub.f32 f942, f1041, f1037;
add.f32 f943, f910, f932;
sub.f32 f945, f910, f932;
add.f32 f1033, f1040, f933;
sub.f32 f946, f1040, f933;
sub.f32 f947, f908, f925;
add.f32 f949, f908, f925;
add.f32 f1032, f909, f924;
sub.f32 f950, f909, f924;
add.f32 f951, f912, f936;
sub.f32 f953, f912, f936;
add.f32 f1031, f913, f938;
sub.f32 f954, f913, f938;
mul.f32 f956, f1033, 0f3EC3EF15;
mul.f32 f1030, f943, 0f3F6C835E;
sub.f32 f957, f1030, f956;
mul.f32 f958, f1033, 0f3F6C835E;
fma.rn.f32 f959, f943, 0f3EC3EF15, f958;
mul.f32 f960, f947, 0f3F3504F3;
mul.f32 f961, f1032, 0f3F3504F3;
sub.f32 f962, f960, f961;
add.f32 f963, f960, f961;
mul.f32 f965, f1031, 0f3F6C835E;
mul.f32 f1029, f951, 0f3EC3EF15;
sub.f32 f966, f1029, f965;
mul.f32 f967, f1031, 0f3EC3EF15;
fma.rn.f32 f968, f951, 0f3F6C835E, f967;
mul.f32 f970, f946, 0f3F6C835E;
mul.f32 f1028, f945, 0fBEC3EF15;
sub.f32 f971, f1028, f970;
mul.f32 f972, f946, 0fBEC3EF15;
fma.rn.f32 f973, f945, 0f3F6C835E, f972;
mul.f32 f975, f950, 0f3F3504F3;
mul.f32 f1027, f949, 0fBF3504F3;
sub.f32 f976, f1027, f975;
mul.f32 f977, f950, 0fBF3504F3;
fma.rn.f32 f978, f949, 0f3F3504F3, f977;
mul.f32 f980, f954, 0f3EC3EF15;
mul.f32 f1026, f953, 0fBF6C835E;
sub.f32 f981, f1026, f980;
mul.f32 f982, f954, 0fBF6C835E;
fma.rn.f32 f983, f953, 0f3EC3EF15, f982;
add.f32 %0, f882, f939;
add.f32 %1, f1047, f1034;
add.f32 %2, f886, f957;
add.f32 %3, f1046, f959;
add.f32 %4, f890, f962;
add.f32 %5, f1045, f963;
add.f32 %6, f894, f966;
add.f32 %7, f1044, f968;
add.f32 %9, f885, f941;
sub.f32 %8, f884, f942;
add.f32 %11, f889, f973;
add.f32 %10, f888, f971;
add.f32 %12, f892, f976;
add.f32 %13, f893, f978;
add.f32 %14, f896, f981;
add.f32 %15, f897, f983;
sub.f32 %17, f1047, f1034;
sub.f32 %16, f882, f939;
sub.f32 %19, f1046, f959;
sub.f32 %18, f886, f957;
sub.f32 %21, f1045, f963;
sub.f32 %20, f890, f962;
sub.f32 %23, f1044, f968;
sub.f32 %22, f894, f966;
sub.f32 %25, f885, f941;
add.f32 %24, f884, f942;
sub.f32 %27, f889, f973;
sub.f32 %26, f888, f971;
sub.f32 %29, f893, f978;
sub.f32 %28, f892, f976;
sub.f32 %31, f897, f983;
sub.f32 %30, f896, f981;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_4096), "l"(lut_sp_16_256), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y));
};




template<> __forceinline__ __device__ void cufftdx_private_function<307, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<2548>;
.reg .b32 r<40>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 15;
mov.u32 r3, %64;
add.s32 r4, r3, r2;
add.f32 f129, %67, %99;
sub.f32 f131, %67, %99;
add.f32 f2540, %68, %131;
sub.f32 f132, %68, %131;
add.f32 f133, %83, %115;
sub.f32 f135, %83, %115;
add.f32 f2538, %132, %116;
sub.f32 f136, %132, %116;
add.f32 f137, f129, f133;
sub.f32 f139, f129, f133;
add.f32 f2537, f2540, f2538;
sub.f32 f140, f2540, f2538;
sub.f32 f141, f131, f136;
add.f32 f143, f131, f136;
add.f32 f2536, f132, f135;
sub.f32 f144, f132, f135;
add.f32 f145, %75, %107;
sub.f32 f147, %75, %107;
add.f32 f2533, %133, %134;
sub.f32 f148, %133, %134;
add.f32 f149, %91, %123;
sub.f32 f151, %91, %123;
add.f32 f2531, %92, %135;
sub.f32 f152, %92, %135;
add.f32 f153, f145, f149;
sub.f32 f155, f145, f149;
add.f32 f2530, f2533, f2531;
sub.f32 f156, f2533, f2531;
sub.f32 f157, f147, f152;
add.f32 f159, f147, f152;
add.f32 f2529, f148, f151;
sub.f32 f160, f148, f151;
mul.f32 f161, f157, 0f3F3504F3;
mul.f32 f162, f2529, 0f3F3504F3;
sub.f32 f163, f161, f162;
add.f32 f164, f161, f162;
mul.f32 f2527, f159, 0fBF3504F3;
mul.f32 f2528, f160, 0f3F3504F3;
sub.f32 f167, f2527, f2528;
mul.f32 f168, f160, 0fBF3504F3;
fma.rn.f32 f169, f159, 0f3F3504F3, f168;
add.f32 f170, f137, f153;
sub.f32 f172, f137, f153;
add.f32 f2526, f2537, f2530;
sub.f32 f173, f2537, f2530;
add.f32 f174, f141, f163;
sub.f32 f176, f141, f163;
add.f32 f2525, f2536, f164;
sub.f32 f177, f2536, f164;
sub.f32 f178, f139, f156;
add.f32 f180, f139, f156;
add.f32 f2524, f140, f155;
sub.f32 f181, f140, f155;
add.f32 f182, f143, f167;
sub.f32 f184, f143, f167;
add.f32 f2523, f144, f169;
sub.f32 f185, f144, f169;
add.f32 f186, %71, %103;
sub.f32 f188, %71, %103;
add.f32 f2521, %136, %104;
sub.f32 f189, %136, %104;
add.f32 f190, %87, %119;
sub.f32 f192, %87, %119;
add.f32 f2518, %138, %137;
sub.f32 f193, %138, %137;
add.f32 f194, f186, f190;
sub.f32 f196, f186, f190;
add.f32 f2517, f2521, f2518;
sub.f32 f197, f2521, f2518;
sub.f32 f198, f188, f193;
add.f32 f200, f188, f193;
add.f32 f2516, f189, f192;
sub.f32 f201, f189, f192;
add.f32 f202, %79, %111;
sub.f32 f204, %79, %111;
add.f32 f2514, %80, %139;
sub.f32 f205, %80, %139;
add.f32 f206, %95, %127;
sub.f32 f208, %95, %127;
add.f32 f2512, %140, %128;
sub.f32 f209, %140, %128;
add.f32 f210, f202, f206;
sub.f32 f212, f202, f206;
add.f32 f2511, f2514, f2512;
sub.f32 f213, f2514, f2512;
sub.f32 f214, f204, f209;
add.f32 f216, f204, f209;
add.f32 f2510, f205, f208;
sub.f32 f217, f205, f208;
mul.f32 f218, f214, 0f3F3504F3;
mul.f32 f219, f2510, 0f3F3504F3;
sub.f32 f220, f218, f219;
add.f32 f221, f218, f219;
mul.f32 f2508, f216, 0fBF3504F3;
mul.f32 f2509, f217, 0f3F3504F3;
sub.f32 f224, f2508, f2509;
mul.f32 f225, f217, 0fBF3504F3;
fma.rn.f32 f226, f216, 0f3F3504F3, f225;
add.f32 f227, f194, f210;
sub.f32 f229, f194, f210;
add.f32 f2507, f2517, f2511;
sub.f32 f230, f2517, f2511;
add.f32 f231, f198, f220;
sub.f32 f233, f198, f220;
add.f32 f2506, f2516, f221;
sub.f32 f234, f2516, f221;
sub.f32 f235, f196, f213;
add.f32 f237, f196, f213;
add.f32 f2505, f197, f212;
sub.f32 f238, f197, f212;
add.f32 f239, f200, f224;
sub.f32 f241, f200, f224;
add.f32 f2504, f201, f226;
sub.f32 f242, f201, f226;
mul.f32 f2502, f231, 0f3F6C835E;
mul.f32 f2503, f2506, 0f3EC3EF15;
sub.f32 f245, f2502, f2503;
mul.f32 f246, f2506, 0f3F6C835E;
fma.rn.f32 f247, f231, 0f3EC3EF15, f246;
mul.f32 f248, f235, 0f3F3504F3;
mul.f32 f249, f2505, 0f3F3504F3;
sub.f32 f250, f248, f249;
add.f32 f251, f248, f249;
mul.f32 f253, f2504, 0f3F6C835E;
mul.f32 f2501, f239, 0f3EC3EF15;
sub.f32 f254, f2501, f253;
mul.f32 f255, f2504, 0f3EC3EF15;
fma.rn.f32 f256, f239, 0f3F6C835E, f255;
mul.f32 f258, f234, 0f3F6C835E;
mul.f32 f2500, f233, 0fBEC3EF15;
sub.f32 f259, f2500, f258;
mul.f32 f260, f234, 0fBEC3EF15;
fma.rn.f32 f261, f233, 0f3F6C835E, f260;
mul.f32 f2498, f237, 0fBF3504F3;
mul.f32 f2499, f238, 0f3F3504F3;
sub.f32 f264, f2498, f2499;
mul.f32 f265, f238, 0fBF3504F3;
fma.rn.f32 f266, f237, 0f3F3504F3, f265;
mul.f32 f2496, f241, 0fBF6C835E;
mul.f32 f2497, f242, 0f3EC3EF15;
sub.f32 f269, f2496, f2497;
mul.f32 f270, f242, 0fBF6C835E;
fma.rn.f32 f271, f241, 0f3EC3EF15, f270;
add.f32 f272, f170, f227;
sub.f32 f274, f170, f227;
add.f32 f2495, f2526, f2507;
sub.f32 f275, f2526, f2507;
add.f32 f276, f174, f245;
sub.f32 f278, f174, f245;
add.f32 f2494, f2525, f247;
sub.f32 f279, f2525, f247;
add.f32 f280, f178, f250;
sub.f32 f282, f178, f250;
add.f32 f2493, f2524, f251;
sub.f32 f283, f2524, f251;
add.f32 f284, f182, f254;
sub.f32 f286, f182, f254;
add.f32 f2492, f2523, f256;
sub.f32 f287, f2523, f256;
sub.f32 f288, f172, f230;
add.f32 f290, f172, f230;
add.f32 f2491, f173, f229;
sub.f32 f291, f173, f229;
add.f32 f292, f176, f259;
sub.f32 f294, f176, f259;
add.f32 f2490, f177, f261;
sub.f32 f295, f177, f261;
add.f32 f296, f180, f264;
sub.f32 f298, f180, f264;
add.f32 f2489, f181, f266;
sub.f32 f299, f181, f266;
add.f32 f300, f184, f269;
sub.f32 f302, f184, f269;
add.f32 f2488, f185, f271;
sub.f32 f303, f185, f271;
add.f32 f304, %69, %101;
sub.f32 f306, %69, %101;
add.f32 f2485, %142, %141;
sub.f32 f307, %142, %141;
add.f32 f308, %85, %117;
sub.f32 f310, %85, %117;
add.f32 f2483, %86, %143;
sub.f32 f311, %86, %143;
add.f32 f312, f304, f308;
sub.f32 f314, f304, f308;
add.f32 f2482, f2485, f2483;
sub.f32 f315, f2485, f2483;
sub.f32 f316, f306, f311;
add.f32 f318, f306, f311;
add.f32 f2481, f307, f310;
sub.f32 f319, f307, f310;
add.f32 f320, %77, %109;
sub.f32 f322, %77, %109;
add.f32 f2479, %144, %110;
sub.f32 f323, %144, %110;
add.f32 f324, %93, %125;
sub.f32 f326, %93, %125;
add.f32 f2476, %146, %145;
sub.f32 f327, %146, %145;
add.f32 f328, f320, f324;
sub.f32 f330, f320, f324;
add.f32 f2475, f2479, f2476;
sub.f32 f331, f2479, f2476;
sub.f32 f332, f322, f327;
add.f32 f334, f322, f327;
add.f32 f2474, f323, f326;
sub.f32 f335, f323, f326;
mul.f32 f336, f332, 0f3F3504F3;
mul.f32 f337, f2474, 0f3F3504F3;
sub.f32 f338, f336, f337;
add.f32 f339, f336, f337;
mul.f32 f341, f335, 0f3F3504F3;
mul.f32 f2473, f334, 0fBF3504F3;
sub.f32 f342, f2473, f341;
mul.f32 f343, f335, 0fBF3504F3;
fma.rn.f32 f344, f334, 0f3F3504F3, f343;
add.f32 f345, f312, f328;
sub.f32 f347, f312, f328;
add.f32 f2472, f2482, f2475;
sub.f32 f348, f2482, f2475;
add.f32 f349, f316, f338;
sub.f32 f351, f316, f338;
add.f32 f2471, f2481, f339;
sub.f32 f352, f2481, f339;
sub.f32 f353, f314, f331;
add.f32 f355, f314, f331;
add.f32 f2470, f315, f330;
sub.f32 f356, f315, f330;
add.f32 f357, f318, f342;
sub.f32 f359, f318, f342;
add.f32 f2469, f319, f344;
sub.f32 f360, f319, f344;
add.f32 f361, %73, %105;
sub.f32 f363, %73, %105;
add.f32 f2467, %74, %147;
sub.f32 f364, %74, %147;
add.f32 f365, %89, %121;
sub.f32 f367, %89, %121;
add.f32 f2465, %148, %122;
sub.f32 f368, %148, %122;
add.f32 f369, f361, f365;
sub.f32 f371, f361, f365;
add.f32 f2464, f2467, f2465;
sub.f32 f372, f2467, f2465;
sub.f32 f373, f363, f368;
add.f32 f375, f363, f368;
add.f32 f2463, f364, f367;
sub.f32 f376, f364, f367;
add.f32 f377, %81, %113;
sub.f32 f379, %81, %113;
add.f32 f2460, %149, %150;
sub.f32 f380, %149, %150;
add.f32 f381, %97, %129;
sub.f32 f383, %97, %129;
add.f32 f2459, %98, %130;
sub.f32 f384, %98, %130;
add.f32 f385, f377, f381;
sub.f32 f387, f377, f381;
add.f32 f2458, f2460, f2459;
sub.f32 f388, f2460, f2459;
sub.f32 f389, f379, f384;
add.f32 f391, f379, f384;
add.f32 f2457, f380, f383;
sub.f32 f392, f380, f383;
mul.f32 f393, f389, 0f3F3504F3;
mul.f32 f394, f2457, 0f3F3504F3;
sub.f32 f395, f393, f394;
add.f32 f396, f393, f394;
mul.f32 f2455, f391, 0fBF3504F3;
mul.f32 f2456, f392, 0f3F3504F3;
sub.f32 f399, f2455, f2456;
mul.f32 f400, f392, 0fBF3504F3;
fma.rn.f32 f401, f391, 0f3F3504F3, f400;
add.f32 f402, f369, f385;
sub.f32 f404, f369, f385;
add.f32 f2454, f2464, f2458;
sub.f32 f405, f2464, f2458;
add.f32 f406, f373, f395;
sub.f32 f408, f373, f395;
add.f32 f2453, f2463, f396;
sub.f32 f409, f2463, f396;
sub.f32 f410, f371, f388;
add.f32 f412, f371, f388;
add.f32 f2452, f372, f387;
sub.f32 f413, f372, f387;
add.f32 f414, f375, f399;
sub.f32 f416, f375, f399;
add.f32 f2451, f376, f401;
sub.f32 f417, f376, f401;
mul.f32 f419, f2453, 0f3EC3EF15;
mul.f32 f2450, f406, 0f3F6C835E;
sub.f32 f420, f2450, f419;
mul.f32 f421, f2453, 0f3F6C835E;
fma.rn.f32 f422, f406, 0f3EC3EF15, f421;
mul.f32 f423, f410, 0f3F3504F3;
mul.f32 f424, f2452, 0f3F3504F3;
sub.f32 f425, f423, f424;
add.f32 f426, f423, f424;
mul.f32 f428, f2451, 0f3F6C835E;
mul.f32 f2449, f414, 0f3EC3EF15;
sub.f32 f429, f2449, f428;
mul.f32 f430, f2451, 0f3EC3EF15;
fma.rn.f32 f431, f414, 0f3F6C835E, f430;
mul.f32 f433, f409, 0f3F6C835E;
mul.f32 f2448, f408, 0fBEC3EF15;
sub.f32 f434, f2448, f433;
mul.f32 f435, f409, 0fBEC3EF15;
fma.rn.f32 f436, f408, 0f3F6C835E, f435;
mul.f32 f438, f413, 0f3F3504F3;
mul.f32 f2447, f412, 0fBF3504F3;
sub.f32 f439, f2447, f438;
mul.f32 f440, f413, 0fBF3504F3;
fma.rn.f32 f441, f412, 0f3F3504F3, f440;
mul.f32 f443, f417, 0f3EC3EF15;
mul.f32 f2446, f416, 0fBF6C835E;
sub.f32 f444, f2446, f443;
mul.f32 f445, f417, 0fBF6C835E;
fma.rn.f32 f446, f416, 0f3EC3EF15, f445;
add.f32 f447, f345, f402;
sub.f32 f449, f345, f402;
add.f32 f2445, f2472, f2454;
sub.f32 f450, f2472, f2454;
add.f32 f451, f349, f420;
sub.f32 f453, f349, f420;
add.f32 f2444, f2471, f422;
sub.f32 f454, f2471, f422;
add.f32 f455, f353, f425;
sub.f32 f457, f353, f425;
add.f32 f2443, f2470, f426;
sub.f32 f458, f2470, f426;
add.f32 f459, f357, f429;
sub.f32 f461, f357, f429;
add.f32 f2442, f2469, f431;
sub.f32 f462, f2469, f431;
sub.f32 f463, f347, f405;
add.f32 f465, f347, f405;
add.f32 f2441, f348, f404;
sub.f32 f466, f348, f404;
add.f32 f467, f351, f434;
sub.f32 f469, f351, f434;
add.f32 f2440, f352, f436;
sub.f32 f470, f352, f436;
add.f32 f471, f355, f439;
sub.f32 f473, f355, f439;
add.f32 f2439, f356, f441;
sub.f32 f474, f356, f441;
add.f32 f475, f359, f444;
sub.f32 f477, f359, f444;
add.f32 f2438, f360, f446;
sub.f32 f478, f360, f446;
mul.f32 f480, f2444, 0f3E47C5C2;
mul.f32 f2437, f451, 0f3F7B14BE;
sub.f32 f481, f2437, f480;
mul.f32 f482, f2444, 0f3F7B14BE;
fma.rn.f32 f483, f451, 0f3E47C5C2, f482;
mul.f32 f485, f2443, 0f3EC3EF15;
mul.f32 f2436, f455, 0f3F6C835E;
sub.f32 f486, f2436, f485;
mul.f32 f487, f2443, 0f3F6C835E;
fma.rn.f32 f488, f455, 0f3EC3EF15, f487;
mul.f32 f490, f2442, 0f3F0E39DA;
mul.f32 f2435, f459, 0f3F54DB31;
sub.f32 f491, f2435, f490;
mul.f32 f492, f2442, 0f3F54DB31;
fma.rn.f32 f493, f459, 0f3F0E39DA, f492;
mul.f32 f494, f463, 0f3F3504F3;
mul.f32 f495, f2441, 0f3F3504F3;
sub.f32 f496, f494, f495;
add.f32 f497, f494, f495;
mul.f32 f499, f2440, 0f3F54DB31;
mul.f32 f2434, f467, 0f3F0E39DA;
sub.f32 f500, f2434, f499;
mul.f32 f501, f2440, 0f3F0E39DA;
fma.rn.f32 f502, f467, 0f3F54DB31, f501;
mul.f32 f504, f2439, 0f3F6C835E;
mul.f32 f2433, f471, 0f3EC3EF15;
sub.f32 f505, f2433, f504;
mul.f32 f506, f2439, 0f3EC3EF15;
fma.rn.f32 f507, f471, 0f3F6C835E, f506;
mul.f32 f509, f2438, 0f3F7B14BE;
mul.f32 f2432, f475, 0f3E47C5C2;
sub.f32 f510, f2432, f509;
mul.f32 f511, f2438, 0f3E47C5C2;
fma.rn.f32 f512, f475, 0f3F7B14BE, f511;
mul.f32 f514, f454, 0f3F7B14BE;
mul.f32 f2431, f453, 0fBE47C5C2;
sub.f32 f515, f2431, f514;
mul.f32 f516, f454, 0fBE47C5C2;
fma.rn.f32 f517, f453, 0f3F7B14BE, f516;
mul.f32 f2429, f457, 0fBEC3EF15;
mul.f32 f2430, f458, 0f3F6C835E;
sub.f32 f520, f2429, f2430;
mul.f32 f521, f458, 0fBEC3EF15;
fma.rn.f32 f522, f457, 0f3F6C835E, f521;
mul.f32 f2427, f461, 0fBF0E39DA;
mul.f32 f2428, f462, 0f3F54DB31;
sub.f32 f525, f2427, f2428;
mul.f32 f526, f462, 0fBF0E39DA;
fma.rn.f32 f527, f461, 0f3F54DB31, f526;
mul.f32 f2425, f465, 0fBF3504F3;
mul.f32 f2426, f466, 0f3F3504F3;
sub.f32 f530, f2425, f2426;
mul.f32 f531, f466, 0fBF3504F3;
fma.rn.f32 f532, f465, 0f3F3504F3, f531;
mul.f32 f2423, f469, 0fBF54DB31;
mul.f32 f2424, f470, 0f3F0E39DA;
sub.f32 f535, f2423, f2424;
mul.f32 f536, f470, 0fBF54DB31;
fma.rn.f32 f537, f469, 0f3F0E39DA, f536;
mul.f32 f539, f474, 0f3EC3EF15;
mul.f32 f2422, f473, 0fBF6C835E;
sub.f32 f540, f2422, f539;
mul.f32 f541, f474, 0fBF6C835E;
fma.rn.f32 f542, f473, 0f3EC3EF15, f541;
mul.f32 f544, f478, 0f3E47C5C2;
mul.f32 f2421, f477, 0fBF7B14BE;
sub.f32 f545, f2421, f544;
mul.f32 f546, f478, 0fBF7B14BE;
fma.rn.f32 f547, f477, 0f3E47C5C2, f546;
add.f32 f550, f276, f481;
sub.f32 f552, f276, f481;
add.f32 f2420, f2494, f483;
sub.f32 f553, f2494, f483;
add.f32 f554, f280, f486;
sub.f32 f556, f280, f486;
add.f32 f2419, f2493, f488;
sub.f32 f557, f2493, f488;
add.f32 f558, f284, f491;
sub.f32 f560, f284, f491;
add.f32 f2418, f2492, f493;
sub.f32 f561, f2492, f493;
add.f32 f562, f288, f496;
sub.f32 f564, f288, f496;
add.f32 f2417, f2491, f497;
sub.f32 f565, f2491, f497;
add.f32 f566, f292, f500;
sub.f32 f568, f292, f500;
add.f32 f2416, f2490, f502;
sub.f32 f569, f2490, f502;
add.f32 f570, f296, f505;
sub.f32 f572, f296, f505;
add.f32 f2415, f2489, f507;
sub.f32 f573, f2489, f507;
add.f32 f574, f300, f510;
sub.f32 f576, f300, f510;
add.f32 f2414, f2488, f512;
sub.f32 f577, f2488, f512;
sub.f32 f578, f274, f450;
add.f32 f580, f274, f450;
add.f32 f2413, f275, f449;
sub.f32 f581, f275, f449;
add.f32 f582, f278, f515;
sub.f32 f584, f278, f515;
add.f32 f2412, f279, f517;
sub.f32 f585, f279, f517;
add.f32 f586, f282, f520;
sub.f32 f588, f282, f520;
add.f32 f2411, f283, f522;
sub.f32 f589, f283, f522;
add.f32 f590, f286, f525;
sub.f32 f592, f286, f525;
add.f32 f2410, f287, f527;
sub.f32 f593, f287, f527;
add.f32 f594, f290, f530;
sub.f32 f596, f290, f530;
add.f32 f2409, f291, f532;
sub.f32 f597, f291, f532;
add.f32 f598, f294, f535;
sub.f32 f600, f294, f535;
add.f32 f2408, f295, f537;
sub.f32 f601, f295, f537;
add.f32 f602, f298, f540;
sub.f32 f604, f298, f540;
add.f32 f2407, f299, f542;
sub.f32 f605, f299, f542;
add.f32 f606, f302, f545;
sub.f32 f608, f302, f545;
add.f32 f2406, f303, f547;
sub.f32 f609, f303, f547;
mov.u32 r22, %tid.x;
shl.b32 r7, r22, 8;
and.b32 r8, r7, -32768;
add.s32 r9, r4, r8;
shl.b32 r10, r22, 3;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 1016;
mov.u64 rd4, %65;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f610, f611}, [rd5];
mul.f32 f614, f2420, f611;
mul.f32 f616, f610, f2420;
mul.f32 f618, f611, f611;
mul.f32 f2405, f610, f610;
sub.f32 f619, f2405, f618;
mul.f32 f620, f611, f610;
fma.rn.f32 f621, f611, f610, f620;
mul.f32 f622, f2419, f621;
mul.f32 f624, f619, f2419;
mul.f32 f626, f611, f621;
mul.f32 f2404, f610, f619;
sub.f32 f627, f2404, f626;
mul.f32 f2403, f554, f621;
mul.f32 f628, f610, f621;
fma.rn.f32 f629, f611, f619, f628;
mul.f32 f630, f2418, f629;
mul.f32 f632, f627, f2418;
mul.f32 f2401, f610, f627;
mul.f32 f2402, f611, f629;
sub.f32 f635, f2401, f2402;
mul.f32 f2400, f558, f629;
mul.f32 f636, f610, f629;
fma.rn.f32 f637, f611, f627, f636;
mul.f32 f638, f2417, f637;
mul.f32 f640, f635, f2417;
mul.f32 f642, f611, f637;
mul.f32 f2399, f610, f635;
sub.f32 f643, f2399, f642;
mul.f32 f2398, f562, f637;
mul.f32 f644, f610, f637;
fma.rn.f32 f645, f611, f635, f644;
mul.f32 f646, f2416, f645;
mul.f32 f648, f643, f2416;
mul.f32 f2396, f610, f643;
mul.f32 f2397, f611, f645;
sub.f32 f651, f2396, f2397;
mul.f32 f2395, f566, f645;
mul.f32 f652, f610, f645;
fma.rn.f32 f653, f611, f643, f652;
mul.f32 f654, f2415, f653;
mul.f32 f656, f651, f2415;
mul.f32 f658, f611, f653;
mul.f32 f2394, f610, f651;
sub.f32 f659, f2394, f658;
mul.f32 f2393, f570, f653;
mul.f32 f660, f610, f653;
fma.rn.f32 f661, f611, f651, f660;
mul.f32 f662, f2414, f661;
mul.f32 f664, f659, f2414;
mul.f32 f666, f611, f661;
mul.f32 f2392, f610, f659;
sub.f32 f667, f2392, f666;
mul.f32 f2391, f574, f661;
mul.f32 f668, f610, f661;
fma.rn.f32 f669, f611, f659, f668;
mul.f32 f670, f2413, f669;
mul.f32 f672, f667, f2413;
mul.f32 f2389, f610, f667;
mul.f32 f2390, f611, f669;
sub.f32 f675, f2389, f2390;
mul.f32 f2388, f578, f669;
mul.f32 f676, f610, f669;
fma.rn.f32 f677, f611, f667, f676;
mul.f32 f678, f2412, f677;
mul.f32 f680, f675, f2412;
mul.f32 f682, f611, f677;
mul.f32 f2387, f610, f675;
sub.f32 f683, f2387, f682;
mul.f32 f2386, f582, f677;
mul.f32 f684, f610, f677;
fma.rn.f32 f685, f611, f675, f684;
mul.f32 f686, f2411, f685;
mul.f32 f688, f683, f2411;
mul.f32 f690, f611, f685;
mul.f32 f2385, f610, f683;
sub.f32 f691, f2385, f690;
mul.f32 f2384, f586, f685;
mul.f32 f692, f610, f685;
fma.rn.f32 f693, f611, f683, f692;
mul.f32 f694, f2410, f693;
mul.f32 f696, f691, f2410;
mul.f32 f2382, f610, f691;
mul.f32 f2383, f611, f693;
sub.f32 f699, f2382, f2383;
mul.f32 f2381, f590, f693;
mul.f32 f700, f610, f693;
fma.rn.f32 f701, f611, f691, f700;
mul.f32 f702, f2409, f701;
mul.f32 f704, f699, f2409;
mul.f32 f706, f611, f701;
mul.f32 f2380, f610, f699;
sub.f32 f707, f2380, f706;
mul.f32 f2379, f594, f701;
mul.f32 f708, f610, f701;
fma.rn.f32 f709, f611, f699, f708;
mul.f32 f710, f2408, f709;
mul.f32 f712, f707, f2408;
mul.f32 f2377, f610, f707;
mul.f32 f2378, f611, f709;
sub.f32 f715, f2377, f2378;
mul.f32 f2376, f598, f709;
mul.f32 f716, f610, f709;
fma.rn.f32 f717, f611, f707, f716;
mul.f32 f718, f2407, f717;
mul.f32 f720, f715, f2407;
mul.f32 f722, f611, f717;
mul.f32 f2375, f610, f715;
sub.f32 f723, f2375, f722;
mul.f32 f2374, f602, f717;
mul.f32 f724, f610, f717;
fma.rn.f32 f725, f611, f715, f724;
mul.f32 f726, f2406, f725;
mul.f32 f728, f723, f2406;
mul.f32 f730, f611, f725;
mul.f32 f2373, f610, f723;
sub.f32 f731, f2373, f730;
mul.f32 f2372, f606, f725;
mul.f32 f732, f610, f725;
fma.rn.f32 f733, f611, f723, f732;
sub.f32 f2371, f2495, f2445;
mul.f32 f734, f2371, f733;
mul.f32 f736, f731, f2371;
mul.f32 f2369, f610, f731;
mul.f32 f2370, f611, f733;
sub.f32 f739, f2369, f2370;
sub.f32 f2368, f272, f447;
mul.f32 f2367, f2368, f733;
mul.f32 f740, f610, f733;
fma.rn.f32 f741, f611, f731, f740;
mul.f32 f742, f553, f741;
mul.f32 f744, f739, f553;
mul.f32 f746, f611, f741;
mul.f32 f2366, f610, f739;
sub.f32 f747, f2366, f746;
mul.f32 f2365, f552, f741;
mul.f32 f748, f610, f741;
fma.rn.f32 f749, f611, f739, f748;
mul.f32 f750, f557, f749;
mul.f32 f752, f747, f557;
mul.f32 f754, f611, f749;
mul.f32 f2364, f610, f747;
sub.f32 f755, f2364, f754;
mul.f32 f2363, f556, f749;
mul.f32 f756, f610, f749;
fma.rn.f32 f757, f611, f747, f756;
mul.f32 f758, f561, f757;
mul.f32 f760, f755, f561;
mul.f32 f2361, f610, f755;
mul.f32 f2362, f611, f757;
sub.f32 f763, f2361, f2362;
mul.f32 f2360, f560, f757;
mul.f32 f764, f610, f757;
fma.rn.f32 f765, f611, f755, f764;
mul.f32 f766, f565, f765;
mul.f32 f768, f763, f565;
mul.f32 f770, f611, f765;
mul.f32 f2359, f610, f763;
sub.f32 f771, f2359, f770;
mul.f32 f2358, f564, f765;
mul.f32 f772, f610, f765;
fma.rn.f32 f773, f611, f763, f772;
mul.f32 f774, f569, f773;
mul.f32 f776, f771, f569;
mul.f32 f2356, f610, f771;
mul.f32 f2357, f611, f773;
sub.f32 f779, f2356, f2357;
mul.f32 f2355, f568, f773;
mul.f32 f780, f610, f773;
fma.rn.f32 f781, f611, f771, f780;
mul.f32 f782, f573, f781;
mul.f32 f784, f779, f573;
mul.f32 f786, f611, f781;
mul.f32 f2354, f610, f779;
sub.f32 f787, f2354, f786;
mul.f32 f2353, f572, f781;
mul.f32 f788, f610, f781;
fma.rn.f32 f789, f611, f779, f788;
mul.f32 f790, f577, f789;
mul.f32 f792, f787, f577;
mul.f32 f794, f611, f789;
mul.f32 f2352, f610, f787;
sub.f32 f795, f2352, f794;
mul.f32 f2351, f576, f789;
mul.f32 f796, f610, f789;
fma.rn.f32 f797, f611, f787, f796;
mul.f32 f798, f581, f797;
mul.f32 f800, f795, f581;
mul.f32 f2349, f610, f795;
mul.f32 f2350, f611, f797;
sub.f32 f803, f2349, f2350;
mul.f32 f2348, f580, f797;
mul.f32 f804, f610, f797;
fma.rn.f32 f805, f611, f795, f804;
mul.f32 f806, f585, f805;
mul.f32 f808, f803, f585;
mul.f32 f810, f611, f805;
mul.f32 f2347, f610, f803;
sub.f32 f811, f2347, f810;
mul.f32 f2346, f584, f805;
mul.f32 f812, f610, f805;
fma.rn.f32 f813, f611, f803, f812;
mul.f32 f814, f589, f813;
mul.f32 f816, f811, f589;
mul.f32 f818, f611, f813;
mul.f32 f2345, f610, f811;
sub.f32 f819, f2345, f818;
mul.f32 f2344, f588, f813;
mul.f32 f820, f610, f813;
fma.rn.f32 f821, f611, f811, f820;
mul.f32 f822, f593, f821;
mul.f32 f824, f819, f593;
mul.f32 f2342, f610, f819;
mul.f32 f2343, f611, f821;
sub.f32 f827, f2342, f2343;
mul.f32 f2341, f592, f821;
mul.f32 f828, f610, f821;
fma.rn.f32 f829, f611, f819, f828;
mul.f32 f830, f597, f829;
mul.f32 f832, f827, f597;
mul.f32 f834, f611, f829;
mul.f32 f2340, f610, f827;
sub.f32 f835, f2340, f834;
mul.f32 f2339, f596, f829;
mul.f32 f836, f610, f829;
fma.rn.f32 f837, f611, f827, f836;
mul.f32 f838, f601, f837;
mul.f32 f840, f835, f601;
mul.f32 f2337, f610, f835;
mul.f32 f2338, f611, f837;
sub.f32 f843, f2337, f2338;
mul.f32 f2336, f600, f837;
mul.f32 f844, f610, f837;
fma.rn.f32 f845, f611, f835, f844;
mul.f32 f846, f605, f845;
mul.f32 f848, f843, f605;
mul.f32 f850, f611, f845;
mul.f32 f2335, f610, f843;
sub.f32 f851, f2335, f850;
mul.f32 f2334, f604, f845;
mul.f32 f852, f610, f845;
mul.f32 f2333, f550, f611;
fma.rn.f32 f853, f611, f843, f852;
mul.f32 f854, f609, f853;
mul.f32 f855, f608, f853;
mul.f32 f856, f851, f609;
barrier.sync 0;
and.b32 r11, r7, 32512;
add.s32 r12, r9, r11;
add.f32 f857, f2495, f2445;
sub.f32 f2544, f272, f447;
add.f32 f858, f272, f447;
mov.u32 r39, %tid.x;
shl.b32 r35, r39, 3;
shl.b32 r27, r39, 8;
fma.rn.f32 f859, f610, f550, f614;
sub.f32 f860, f616, f2333;
st.shared.v4.f32 [r12], {f858, f857, f859, f860};
fma.rn.f32 f861, f619, f554, f622;
sub.f32 f862, f624, f2403;
fma.rn.f32 f863, f627, f558, f630;
sub.f32 f864, f632, f2400;
st.shared.v4.f32 [r12+16], {f861, f862, f863, f864};
fma.rn.f32 f865, f635, f562, f638;
sub.f32 f866, f640, f2398;
sub.f32 f867, f648, f2395;
fma.rn.f32 f868, f643, f566, f646;
st.shared.v4.f32 [r12+32], {f865, f866, f868, f867};
fma.rn.f32 f869, f651, f570, f654;
sub.f32 f870, f656, f2393;
fma.rn.f32 f871, f659, f574, f662;
sub.f32 f872, f664, f2391;
st.shared.v4.f32 [r12+48], {f869, f870, f871, f872};
fma.rn.f32 f873, f667, f578, f670;
sub.f32 f874, f672, f2388;
fma.rn.f32 f875, f675, f582, f678;
sub.f32 f876, f680, f2386;
st.shared.v4.f32 [r12+64], {f873, f874, f875, f876};
fma.rn.f32 f877, f683, f586, f686;
sub.f32 f878, f688, f2384;
fma.rn.f32 f879, f691, f590, f694;
sub.f32 f880, f696, f2381;
st.shared.v4.f32 [r12+80], {f877, f878, f879, f880};
fma.rn.f32 f881, f699, f594, f702;
sub.f32 f882, f704, f2379;
fma.rn.f32 f883, f707, f598, f710;
sub.f32 f884, f712, f2376;
st.shared.v4.f32 [r12+96], {f881, f882, f883, f884};
fma.rn.f32 f885, f715, f602, f718;
sub.f32 f886, f720, f2374;
fma.rn.f32 f887, f723, f606, f726;
sub.f32 f888, f728, f2372;
st.shared.v4.f32 [r12+112], {f885, f886, f887, f888};
fma.rn.f32 f889, f731, f2544, f734;
sub.f32 f890, f736, f2367;
fma.rn.f32 f891, f739, f552, f742;
sub.f32 f892, f744, f2365;
st.shared.v4.f32 [r12+128], {f889, f890, f891, f892};
fma.rn.f32 f893, f747, f556, f750;
sub.f32 f894, f752, f2363;
fma.rn.f32 f895, f755, f560, f758;
sub.f32 f896, f760, f2360;
st.shared.v4.f32 [r12+144], {f893, f894, f895, f896};
fma.rn.f32 f897, f763, f564, f766;
sub.f32 f898, f768, f2358;
fma.rn.f32 f899, f771, f568, f774;
sub.f32 f900, f776, f2355;
st.shared.v4.f32 [r12+160], {f897, f898, f899, f900};
fma.rn.f32 f901, f779, f572, f782;
sub.f32 f902, f784, f2353;
fma.rn.f32 f903, f787, f576, f790;
sub.f32 f904, f792, f2351;
st.shared.v4.f32 [r12+176], {f901, f902, f903, f904};
fma.rn.f32 f905, f795, f580, f798;
sub.f32 f906, f800, f2348;
fma.rn.f32 f907, f803, f584, f806;
sub.f32 f908, f808, f2346;
st.shared.v4.f32 [r12+192], {f905, f906, f907, f908};
fma.rn.f32 f909, f811, f588, f814;
sub.f32 f910, f816, f2344;
fma.rn.f32 f911, f819, f592, f822;
sub.f32 f912, f824, f2341;
st.shared.v4.f32 [r12+208], {f909, f910, f911, f912};
fma.rn.f32 f913, f827, f596, f830;
sub.f32 f914, f832, f2339;
fma.rn.f32 f915, f835, f600, f838;
sub.f32 f916, f840, f2336;
st.shared.v4.f32 [r12+224], {f913, f914, f915, f916};
fma.rn.f32 f917, f843, f604, f846;
sub.f32 f918, f848, f2334;
fma.rn.f32 f919, f851, f608, f854;
sub.f32 f920, f856, f855;
st.shared.v4.f32 [r12+240], {f917, f918, f919, f920};
barrier.sync 0;
and.b32 r21, r39, 127;
mad.lo.s32 r13, r21, -248, r12;
ld.shared.v2.f32 {f921, f922}, [r13];
ld.shared.v2.f32 {f925, f926}, [r13+1024];
ld.shared.v2.f32 {f929, f930}, [r13+2048];
ld.shared.v2.f32 {f933, f934}, [r13+3072];
ld.shared.v2.f32 {f937, f938}, [r13+4096];
ld.shared.v2.f32 {f941, f942}, [r13+5120];
ld.shared.v2.f32 {f945, f946}, [r13+6144];
ld.shared.v2.f32 {f949, f950}, [r13+7168];
ld.shared.v2.f32 {f953, f954}, [r13+8192];
ld.shared.v2.f32 {f957, f958}, [r13+9216];
ld.shared.v2.f32 {f961, f962}, [r13+10240];
ld.shared.v2.f32 {f965, f966}, [r13+11264];
ld.shared.v2.f32 {f969, f970}, [r13+12288];
ld.shared.v2.f32 {f973, f974}, [r13+13312];
ld.shared.v2.f32 {f977, f978}, [r13+14336];
ld.shared.v2.f32 {f981, f982}, [r13+15360];
ld.shared.v2.f32 {f985, f986}, [r13+16384];
ld.shared.v2.f32 {f989, f990}, [r13+17408];
ld.shared.v2.f32 {f993, f994}, [r13+18432];
ld.shared.v2.f32 {f997, f998}, [r13+19456];
ld.shared.v2.f32 {f1001, f1002}, [r13+20480];
ld.shared.v2.f32 {f1005, f1006}, [r13+21504];
ld.shared.v2.f32 {f1009, f1010}, [r13+22528];
ld.shared.v2.f32 {f1013, f1014}, [r13+23552];
ld.shared.v2.f32 {f1017, f1018}, [r13+24576];
ld.shared.v2.f32 {f1021, f1022}, [r13+25600];
ld.shared.v2.f32 {f1025, f1026}, [r13+26624];
ld.shared.v2.f32 {f1029, f1030}, [r13+27648];
ld.shared.v2.f32 {f1033, f1034}, [r13+28672];
ld.shared.v2.f32 {f1037, f1038}, [r13+29696];
ld.shared.v2.f32 {f1041, f1042}, [r13+30720];
ld.shared.v2.f32 {f1045, f1046}, [r13+31744];
add.f32 f1049, f921, f985;
sub.f32 f1051, f921, f985;
add.f32 f2332, f922, f986;
sub.f32 f1052, f922, f986;
add.f32 f1053, f953, f1017;
sub.f32 f1055, f953, f1017;
add.f32 f2331, f954, f1018;
sub.f32 f1056, f954, f1018;
add.f32 f1057, f1049, f1053;
sub.f32 f1059, f1049, f1053;
add.f32 f2330, f2332, f2331;
sub.f32 f1060, f2332, f2331;
sub.f32 f1061, f1051, f1056;
add.f32 f1063, f1051, f1056;
add.f32 f2329, f1052, f1055;
sub.f32 f1064, f1052, f1055;
add.f32 f1065, f937, f1001;
sub.f32 f1067, f937, f1001;
add.f32 f2328, f938, f1002;
sub.f32 f1068, f938, f1002;
add.f32 f1069, f969, f1033;
sub.f32 f1071, f969, f1033;
add.f32 f2327, f970, f1034;
sub.f32 f1072, f970, f1034;
add.f32 f1073, f1065, f1069;
sub.f32 f1075, f1065, f1069;
add.f32 f2326, f2328, f2327;
sub.f32 f1076, f2328, f2327;
sub.f32 f1077, f1067, f1072;
add.f32 f1079, f1067, f1072;
add.f32 f2325, f1068, f1071;
sub.f32 f1080, f1068, f1071;
mul.f32 f1081, f1077, 0f3F3504F3;
mul.f32 f1082, f2325, 0f3F3504F3;
sub.f32 f1083, f1081, f1082;
add.f32 f1084, f1081, f1082;
mul.f32 f2323, f1079, 0fBF3504F3;
mul.f32 f2324, f1080, 0f3F3504F3;
sub.f32 f1087, f2323, f2324;
mul.f32 f1088, f1080, 0fBF3504F3;
fma.rn.f32 f1089, f1079, 0f3F3504F3, f1088;
add.f32 f1090, f1057, f1073;
sub.f32 f1092, f1057, f1073;
add.f32 f2322, f2330, f2326;
sub.f32 f1093, f2330, f2326;
add.f32 f1094, f1061, f1083;
sub.f32 f1096, f1061, f1083;
add.f32 f2321, f2329, f1084;
sub.f32 f1097, f2329, f1084;
sub.f32 f1098, f1059, f1076;
add.f32 f1100, f1059, f1076;
add.f32 f2320, f1060, f1075;
sub.f32 f1101, f1060, f1075;
add.f32 f1102, f1063, f1087;
sub.f32 f1104, f1063, f1087;
add.f32 f2319, f1064, f1089;
sub.f32 f1105, f1064, f1089;
add.f32 f1106, f929, f993;
sub.f32 f1108, f929, f993;
add.f32 f2318, f930, f994;
sub.f32 f1109, f930, f994;
add.f32 f1110, f961, f1025;
sub.f32 f1112, f961, f1025;
add.f32 f2317, f962, f1026;
sub.f32 f1113, f962, f1026;
add.f32 f1114, f1106, f1110;
sub.f32 f1116, f1106, f1110;
add.f32 f2316, f2318, f2317;
sub.f32 f1117, f2318, f2317;
sub.f32 f1118, f1108, f1113;
add.f32 f1120, f1108, f1113;
add.f32 f2315, f1109, f1112;
sub.f32 f1121, f1109, f1112;
add.f32 f1122, f945, f1009;
sub.f32 f1124, f945, f1009;
add.f32 f2314, f946, f1010;
sub.f32 f1125, f946, f1010;
add.f32 f1126, f977, f1041;
sub.f32 f1128, f977, f1041;
add.f32 f2313, f978, f1042;
sub.f32 f1129, f978, f1042;
add.f32 f1130, f1122, f1126;
sub.f32 f1132, f1122, f1126;
add.f32 f2312, f2314, f2313;
sub.f32 f1133, f2314, f2313;
sub.f32 f1134, f1124, f1129;
add.f32 f1136, f1124, f1129;
add.f32 f2311, f1125, f1128;
sub.f32 f1137, f1125, f1128;
mul.f32 f1138, f1134, 0f3F3504F3;
mul.f32 f1139, f2311, 0f3F3504F3;
sub.f32 f1140, f1138, f1139;
add.f32 f1141, f1138, f1139;
mul.f32 f2309, f1136, 0fBF3504F3;
mul.f32 f2310, f1137, 0f3F3504F3;
sub.f32 f1144, f2309, f2310;
mul.f32 f1145, f1137, 0fBF3504F3;
fma.rn.f32 f1146, f1136, 0f3F3504F3, f1145;
add.f32 f1147, f1114, f1130;
sub.f32 f1149, f1114, f1130;
add.f32 f2308, f2316, f2312;
sub.f32 f1150, f2316, f2312;
add.f32 f1151, f1118, f1140;
sub.f32 f1153, f1118, f1140;
add.f32 f2307, f2315, f1141;
sub.f32 f1154, f2315, f1141;
sub.f32 f1155, f1116, f1133;
add.f32 f1157, f1116, f1133;
add.f32 f2306, f1117, f1132;
sub.f32 f1158, f1117, f1132;
add.f32 f1159, f1120, f1144;
sub.f32 f1161, f1120, f1144;
add.f32 f2305, f1121, f1146;
sub.f32 f1162, f1121, f1146;
mul.f32 f2303, f1151, 0f3F6C835E;
mul.f32 f2304, f2307, 0f3EC3EF15;
sub.f32 f1165, f2303, f2304;
mul.f32 f1166, f2307, 0f3F6C835E;
fma.rn.f32 f1167, f1151, 0f3EC3EF15, f1166;
mul.f32 f1168, f1155, 0f3F3504F3;
mul.f32 f1169, f2306, 0f3F3504F3;
sub.f32 f1170, f1168, f1169;
add.f32 f1171, f1168, f1169;
mul.f32 f2301, f1159, 0f3EC3EF15;
mul.f32 f2302, f2305, 0f3F6C835E;
sub.f32 f1174, f2301, f2302;
mul.f32 f1175, f2305, 0f3EC3EF15;
fma.rn.f32 f1176, f1159, 0f3F6C835E, f1175;
mul.f32 f2299, f1153, 0fBEC3EF15;
mul.f32 f2300, f1154, 0f3F6C835E;
sub.f32 f1179, f2299, f2300;
mul.f32 f1180, f1154, 0fBEC3EF15;
fma.rn.f32 f1181, f1153, 0f3F6C835E, f1180;
mul.f32 f2297, f1157, 0fBF3504F3;
mul.f32 f2298, f1158, 0f3F3504F3;
sub.f32 f1184, f2297, f2298;
mul.f32 f1185, f1158, 0fBF3504F3;
fma.rn.f32 f1186, f1157, 0f3F3504F3, f1185;
mul.f32 f2295, f1161, 0fBF6C835E;
mul.f32 f2296, f1162, 0f3EC3EF15;
sub.f32 f1189, f2295, f2296;
mul.f32 f1190, f1162, 0fBF6C835E;
fma.rn.f32 f1191, f1161, 0f3EC3EF15, f1190;
add.f32 f1192, f1090, f1147;
sub.f32 f1194, f1090, f1147;
add.f32 f2294, f2322, f2308;
sub.f32 f1195, f2322, f2308;
add.f32 f1196, f1094, f1165;
sub.f32 f1198, f1094, f1165;
add.f32 f2293, f2321, f1167;
sub.f32 f1199, f2321, f1167;
add.f32 f1200, f1098, f1170;
sub.f32 f1202, f1098, f1170;
add.f32 f2292, f2320, f1171;
sub.f32 f1203, f2320, f1171;
add.f32 f1204, f1102, f1174;
sub.f32 f1206, f1102, f1174;
add.f32 f2291, f2319, f1176;
sub.f32 f1207, f2319, f1176;
sub.f32 f1208, f1092, f1150;
add.f32 f1210, f1092, f1150;
add.f32 f2290, f1093, f1149;
sub.f32 f1211, f1093, f1149;
add.f32 f1212, f1096, f1179;
sub.f32 f1214, f1096, f1179;
add.f32 f2289, f1097, f1181;
sub.f32 f1215, f1097, f1181;
add.f32 f1216, f1100, f1184;
sub.f32 f1218, f1100, f1184;
add.f32 f2288, f1101, f1186;
sub.f32 f1219, f1101, f1186;
add.f32 f1220, f1104, f1189;
sub.f32 f1222, f1104, f1189;
add.f32 f2287, f1105, f1191;
sub.f32 f1223, f1105, f1191;
add.f32 f1224, f925, f989;
sub.f32 f1226, f925, f989;
add.f32 f2286, f926, f990;
sub.f32 f1227, f926, f990;
add.f32 f1228, f957, f1021;
sub.f32 f1230, f957, f1021;
add.f32 f2285, f958, f1022;
sub.f32 f1231, f958, f1022;
add.f32 f1232, f1224, f1228;
sub.f32 f1234, f1224, f1228;
add.f32 f2284, f2286, f2285;
sub.f32 f1235, f2286, f2285;
sub.f32 f1236, f1226, f1231;
add.f32 f1238, f1226, f1231;
add.f32 f2283, f1227, f1230;
sub.f32 f1239, f1227, f1230;
add.f32 f1240, f941, f1005;
sub.f32 f1242, f941, f1005;
add.f32 f2282, f942, f1006;
sub.f32 f1243, f942, f1006;
add.f32 f1244, f973, f1037;
sub.f32 f1246, f973, f1037;
add.f32 f2281, f974, f1038;
sub.f32 f1247, f974, f1038;
add.f32 f1248, f1240, f1244;
sub.f32 f1250, f1240, f1244;
add.f32 f2280, f2282, f2281;
sub.f32 f1251, f2282, f2281;
sub.f32 f1252, f1242, f1247;
add.f32 f1254, f1242, f1247;
add.f32 f2279, f1243, f1246;
sub.f32 f1255, f1243, f1246;
mul.f32 f1256, f1252, 0f3F3504F3;
mul.f32 f1257, f2279, 0f3F3504F3;
sub.f32 f1258, f1256, f1257;
add.f32 f1259, f1256, f1257;
mul.f32 f2277, f1254, 0fBF3504F3;
mul.f32 f2278, f1255, 0f3F3504F3;
sub.f32 f1262, f2277, f2278;
mul.f32 f1263, f1255, 0fBF3504F3;
fma.rn.f32 f1264, f1254, 0f3F3504F3, f1263;
add.f32 f1265, f1232, f1248;
sub.f32 f1267, f1232, f1248;
add.f32 f2276, f2284, f2280;
sub.f32 f1268, f2284, f2280;
add.f32 f1269, f1236, f1258;
sub.f32 f1271, f1236, f1258;
add.f32 f2275, f2283, f1259;
sub.f32 f1272, f2283, f1259;
sub.f32 f1273, f1234, f1251;
add.f32 f1275, f1234, f1251;
add.f32 f2274, f1235, f1250;
sub.f32 f1276, f1235, f1250;
add.f32 f1277, f1238, f1262;
sub.f32 f1279, f1238, f1262;
add.f32 f2273, f1239, f1264;
sub.f32 f1280, f1239, f1264;
add.f32 f1281, f933, f997;
sub.f32 f1283, f933, f997;
add.f32 f2272, f934, f998;
sub.f32 f1284, f934, f998;
add.f32 f1285, f965, f1029;
sub.f32 f1287, f965, f1029;
add.f32 f2271, f966, f1030;
sub.f32 f1288, f966, f1030;
add.f32 f1289, f1281, f1285;
sub.f32 f1291, f1281, f1285;
add.f32 f2270, f2272, f2271;
sub.f32 f1292, f2272, f2271;
sub.f32 f1293, f1283, f1288;
add.f32 f1295, f1283, f1288;
add.f32 f2269, f1284, f1287;
sub.f32 f1296, f1284, f1287;
add.f32 f1297, f949, f1013;
sub.f32 f1299, f949, f1013;
add.f32 f2268, f950, f1014;
sub.f32 f1300, f950, f1014;
add.f32 f1301, f981, f1045;
sub.f32 f1303, f981, f1045;
add.f32 f2267, f982, f1046;
sub.f32 f1304, f982, f1046;
add.f32 f1305, f1297, f1301;
sub.f32 f1307, f1297, f1301;
add.f32 f2266, f2268, f2267;
sub.f32 f1308, f2268, f2267;
sub.f32 f1309, f1299, f1304;
add.f32 f1311, f1299, f1304;
add.f32 f2265, f1300, f1303;
sub.f32 f1312, f1300, f1303;
mul.f32 f1313, f1309, 0f3F3504F3;
mul.f32 f1314, f2265, 0f3F3504F3;
sub.f32 f1315, f1313, f1314;
add.f32 f1316, f1313, f1314;
mul.f32 f2263, f1311, 0fBF3504F3;
mul.f32 f2264, f1312, 0f3F3504F3;
sub.f32 f1319, f2263, f2264;
mul.f32 f1320, f1312, 0fBF3504F3;
fma.rn.f32 f1321, f1311, 0f3F3504F3, f1320;
add.f32 f1322, f1289, f1305;
sub.f32 f1324, f1289, f1305;
add.f32 f2262, f2270, f2266;
sub.f32 f1325, f2270, f2266;
add.f32 f1326, f1293, f1315;
sub.f32 f1328, f1293, f1315;
add.f32 f2261, f2269, f1316;
sub.f32 f1329, f2269, f1316;
sub.f32 f1330, f1291, f1308;
add.f32 f1332, f1291, f1308;
add.f32 f2260, f1292, f1307;
sub.f32 f1333, f1292, f1307;
add.f32 f1334, f1295, f1319;
sub.f32 f1336, f1295, f1319;
add.f32 f2259, f1296, f1321;
sub.f32 f1337, f1296, f1321;
mul.f32 f2257, f1326, 0f3F6C835E;
mul.f32 f2258, f2261, 0f3EC3EF15;
sub.f32 f1340, f2257, f2258;
mul.f32 f1341, f2261, 0f3F6C835E;
fma.rn.f32 f1342, f1326, 0f3EC3EF15, f1341;
mul.f32 f1343, f1330, 0f3F3504F3;
mul.f32 f1344, f2260, 0f3F3504F3;
sub.f32 f1345, f1343, f1344;
add.f32 f1346, f1343, f1344;
mul.f32 f1348, f2259, 0f3F6C835E;
mul.f32 f2256, f1334, 0f3EC3EF15;
sub.f32 f1349, f2256, f1348;
mul.f32 f1350, f2259, 0f3EC3EF15;
fma.rn.f32 f1351, f1334, 0f3F6C835E, f1350;
mul.f32 f1353, f1329, 0f3F6C835E;
mul.f32 f2255, f1328, 0fBEC3EF15;
sub.f32 f1354, f2255, f1353;
mul.f32 f1355, f1329, 0fBEC3EF15;
fma.rn.f32 f1356, f1328, 0f3F6C835E, f1355;
mul.f32 f2253, f1332, 0fBF3504F3;
mul.f32 f2254, f1333, 0f3F3504F3;
sub.f32 f1359, f2253, f2254;
mul.f32 f1360, f1333, 0fBF3504F3;
fma.rn.f32 f1361, f1332, 0f3F3504F3, f1360;
mul.f32 f2251, f1336, 0fBF6C835E;
mul.f32 f2252, f1337, 0f3EC3EF15;
sub.f32 f1364, f2251, f2252;
mul.f32 f1365, f1337, 0fBF6C835E;
fma.rn.f32 f1366, f1336, 0f3EC3EF15, f1365;
add.f32 f1367, f1265, f1322;
sub.f32 f1369, f1265, f1322;
add.f32 f2250, f2276, f2262;
sub.f32 f1370, f2276, f2262;
add.f32 f1371, f1269, f1340;
sub.f32 f1373, f1269, f1340;
add.f32 f2249, f2275, f1342;
sub.f32 f1374, f2275, f1342;
add.f32 f1375, f1273, f1345;
sub.f32 f1377, f1273, f1345;
add.f32 f2248, f2274, f1346;
sub.f32 f1378, f2274, f1346;
add.f32 f1379, f1277, f1349;
sub.f32 f1381, f1277, f1349;
add.f32 f2247, f2273, f1351;
sub.f32 f1382, f2273, f1351;
sub.f32 f1383, f1267, f1325;
add.f32 f1385, f1267, f1325;
add.f32 f2246, f1268, f1324;
sub.f32 f1386, f1268, f1324;
add.f32 f1387, f1271, f1354;
sub.f32 f1389, f1271, f1354;
add.f32 f2245, f1272, f1356;
sub.f32 f1390, f1272, f1356;
add.f32 f1391, f1275, f1359;
sub.f32 f1393, f1275, f1359;
add.f32 f2244, f1276, f1361;
sub.f32 f1394, f1276, f1361;
add.f32 f1395, f1279, f1364;
sub.f32 f1397, f1279, f1364;
add.f32 f2243, f1280, f1366;
sub.f32 f1398, f1280, f1366;
mul.f32 f1400, f2249, 0f3E47C5C2;
mul.f32 f2242, f1371, 0f3F7B14BE;
sub.f32 f1401, f2242, f1400;
mul.f32 f1402, f2249, 0f3F7B14BE;
fma.rn.f32 f1403, f1371, 0f3E47C5C2, f1402;
mul.f32 f1405, f2248, 0f3EC3EF15;
mul.f32 f2241, f1375, 0f3F6C835E;
sub.f32 f1406, f2241, f1405;
mul.f32 f1407, f2248, 0f3F6C835E;
fma.rn.f32 f1408, f1375, 0f3EC3EF15, f1407;
mul.f32 f2239, f1379, 0f3F54DB31;
mul.f32 f2240, f2247, 0f3F0E39DA;
sub.f32 f1411, f2239, f2240;
mul.f32 f1412, f2247, 0f3F54DB31;
fma.rn.f32 f1413, f1379, 0f3F0E39DA, f1412;
mul.f32 f1414, f1383, 0f3F3504F3;
mul.f32 f1415, f2246, 0f3F3504F3;
sub.f32 f1416, f1414, f1415;
add.f32 f1417, f1414, f1415;
mul.f32 f1419, f2245, 0f3F54DB31;
mul.f32 f2238, f1387, 0f3F0E39DA;
sub.f32 f1420, f2238, f1419;
mul.f32 f1421, f2245, 0f3F0E39DA;
fma.rn.f32 f1422, f1387, 0f3F54DB31, f1421;
mul.f32 f1424, f2244, 0f3F6C835E;
mul.f32 f2237, f1391, 0f3EC3EF15;
sub.f32 f1425, f2237, f1424;
mul.f32 f1426, f2244, 0f3EC3EF15;
fma.rn.f32 f1427, f1391, 0f3F6C835E, f1426;
mul.f32 f2235, f1395, 0f3E47C5C2;
mul.f32 f2236, f2243, 0f3F7B14BE;
sub.f32 f1430, f2235, f2236;
mul.f32 f1431, f2243, 0f3E47C5C2;
fma.rn.f32 f1432, f1395, 0f3F7B14BE, f1431;
mul.f32 f2233, f1373, 0fBE47C5C2;
mul.f32 f2234, f1374, 0f3F7B14BE;
sub.f32 f1435, f2233, f2234;
mul.f32 f1436, f1374, 0fBE47C5C2;
fma.rn.f32 f1437, f1373, 0f3F7B14BE, f1436;
mul.f32 f2231, f1377, 0fBEC3EF15;
mul.f32 f2232, f1378, 0f3F6C835E;
sub.f32 f1440, f2231, f2232;
mul.f32 f1441, f1378, 0fBEC3EF15;
fma.rn.f32 f1442, f1377, 0f3F6C835E, f1441;
mul.f32 f2229, f1381, 0fBF0E39DA;
mul.f32 f2230, f1382, 0f3F54DB31;
sub.f32 f1445, f2229, f2230;
mul.f32 f1446, f1382, 0fBF0E39DA;
fma.rn.f32 f1447, f1381, 0f3F54DB31, f1446;
mul.f32 f1449, f1386, 0f3F3504F3;
mul.f32 f2228, f1385, 0fBF3504F3;
sub.f32 f1450, f2228, f1449;
mul.f32 f1451, f1386, 0fBF3504F3;
fma.rn.f32 f1452, f1385, 0f3F3504F3, f1451;
mul.f32 f1454, f1390, 0f3F0E39DA;
mul.f32 f2227, f1389, 0fBF54DB31;
sub.f32 f1455, f2227, f1454;
mul.f32 f1456, f1390, 0fBF54DB31;
fma.rn.f32 f1457, f1389, 0f3F0E39DA, f1456;
mul.f32 f1459, f1394, 0f3EC3EF15;
mul.f32 f2226, f1393, 0fBF6C835E;
sub.f32 f1460, f2226, f1459;
mul.f32 f1461, f1394, 0fBF6C835E;
fma.rn.f32 f1462, f1393, 0f3EC3EF15, f1461;
mul.f32 f1464, f1398, 0f3E47C5C2;
mul.f32 f2225, f1397, 0fBF7B14BE;
sub.f32 f1465, f2225, f1464;
mul.f32 f1466, f1398, 0fBF7B14BE;
fma.rn.f32 f1467, f1397, 0f3E47C5C2, f1466;
add.f32 f1470, f1196, f1401;
sub.f32 f1472, f1196, f1401;
add.f32 f2224, f2293, f1403;
sub.f32 f1473, f2293, f1403;
add.f32 f1474, f1200, f1406;
sub.f32 f1476, f1200, f1406;
add.f32 f2223, f2292, f1408;
sub.f32 f1477, f2292, f1408;
add.f32 f1478, f1204, f1411;
sub.f32 f1480, f1204, f1411;
add.f32 f2222, f2291, f1413;
sub.f32 f1481, f2291, f1413;
add.f32 f1482, f1208, f1416;
sub.f32 f1484, f1208, f1416;
add.f32 f2221, f2290, f1417;
sub.f32 f1485, f2290, f1417;
add.f32 f1486, f1212, f1420;
sub.f32 f1488, f1212, f1420;
add.f32 f2220, f2289, f1422;
sub.f32 f1489, f2289, f1422;
add.f32 f1490, f1216, f1425;
sub.f32 f1492, f1216, f1425;
add.f32 f2219, f2288, f1427;
sub.f32 f1493, f2288, f1427;
add.f32 f1494, f1220, f1430;
sub.f32 f1496, f1220, f1430;
add.f32 f2218, f2287, f1432;
sub.f32 f1497, f2287, f1432;
sub.f32 f1498, f1194, f1370;
add.f32 f1500, f1194, f1370;
add.f32 f2217, f1195, f1369;
sub.f32 f1501, f1195, f1369;
add.f32 f1502, f1198, f1435;
sub.f32 f1504, f1198, f1435;
add.f32 f2216, f1199, f1437;
sub.f32 f1505, f1199, f1437;
add.f32 f1506, f1202, f1440;
sub.f32 f1508, f1202, f1440;
add.f32 f2215, f1203, f1442;
sub.f32 f1509, f1203, f1442;
add.f32 f1510, f1206, f1445;
sub.f32 f1512, f1206, f1445;
add.f32 f2214, f1207, f1447;
sub.f32 f1513, f1207, f1447;
add.f32 f1514, f1210, f1450;
sub.f32 f1516, f1210, f1450;
add.f32 f2213, f1211, f1452;
sub.f32 f1517, f1211, f1452;
add.f32 f1518, f1214, f1455;
sub.f32 f1520, f1214, f1455;
add.f32 f2212, f1215, f1457;
sub.f32 f1521, f1215, f1457;
add.f32 f1522, f1218, f1460;
sub.f32 f1524, f1218, f1460;
add.f32 f2211, f1219, f1462;
sub.f32 f1525, f1219, f1462;
add.f32 f1526, f1222, f1465;
sub.f32 f1528, f1222, f1465;
add.f32 f2210, f1223, f1467;
sub.f32 f1529, f1223, f1467;
and.b32 r14, r39, 96;
bfe.u32 r15, r39, 5, 2;
mul.wide.u32 rd6, r15, 8;
mov.u64 rd7, %66;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f1530, f1531}, [rd8];
mul.f32 f1534, f2224, f1531;
mul.f32 f1536, f1530, f2224;
mul.f32 f2208, f1530, f1530;
mul.f32 f2209, f1531, f1531;
sub.f32 f1539, f2208, f2209;
mul.f32 f1540, f1531, f1530;
fma.rn.f32 f1541, f1531, f1530, f1540;
mul.f32 f1542, f2223, f1541;
mul.f32 f1544, f1539, f2223;
mul.f32 f1546, f1531, f1541;
mul.f32 f2207, f1530, f1539;
sub.f32 f1547, f2207, f1546;
mul.f32 f2206, f1474, f1541;
mul.f32 f1548, f1530, f1541;
fma.rn.f32 f1549, f1531, f1539, f1548;
mul.f32 f1550, f2222, f1549;
mul.f32 f1552, f1547, f2222;
mul.f32 f1554, f1531, f1549;
mul.f32 f2205, f1530, f1547;
sub.f32 f1555, f2205, f1554;
mul.f32 f2204, f1478, f1549;
mul.f32 f1556, f1530, f1549;
fma.rn.f32 f1557, f1531, f1547, f1556;
mul.f32 f1558, f2221, f1557;
mul.f32 f1560, f1555, f2221;
mul.f32 f2202, f1530, f1555;
mul.f32 f2203, f1531, f1557;
sub.f32 f1563, f2202, f2203;
mul.f32 f2201, f1482, f1557;
mul.f32 f1564, f1530, f1557;
fma.rn.f32 f1565, f1531, f1555, f1564;
mul.f32 f1566, f2220, f1565;
mul.f32 f1568, f1563, f2220;
mul.f32 f1570, f1531, f1565;
mul.f32 f2200, f1530, f1563;
sub.f32 f1571, f2200, f1570;
mul.f32 f2199, f1486, f1565;
mul.f32 f1572, f1530, f1565;
fma.rn.f32 f1573, f1531, f1563, f1572;
mul.f32 f1574, f2219, f1573;
mul.f32 f1576, f1571, f2219;
mul.f32 f1578, f1531, f1573;
mul.f32 f2198, f1530, f1571;
sub.f32 f1579, f2198, f1578;
mul.f32 f2197, f1490, f1573;
mul.f32 f1580, f1530, f1573;
fma.rn.f32 f1581, f1531, f1571, f1580;
mul.f32 f1582, f2218, f1581;
mul.f32 f1584, f1579, f2218;
mul.f32 f2195, f1530, f1579;
mul.f32 f2196, f1531, f1581;
sub.f32 f1587, f2195, f2196;
mul.f32 f2194, f1494, f1581;
mul.f32 f1588, f1530, f1581;
fma.rn.f32 f1589, f1531, f1579, f1588;
mul.f32 f1590, f2217, f1589;
mul.f32 f1592, f1587, f2217;
mul.f32 f1594, f1531, f1589;
mul.f32 f2193, f1530, f1587;
sub.f32 f1595, f2193, f1594;
mul.f32 f2192, f1498, f1589;
mul.f32 f1596, f1530, f1589;
fma.rn.f32 f1597, f1531, f1587, f1596;
mul.f32 f1598, f2216, f1597;
mul.f32 f1600, f1595, f2216;
mul.f32 f2190, f1530, f1595;
mul.f32 f2191, f1531, f1597;
sub.f32 f1603, f2190, f2191;
mul.f32 f2189, f1502, f1597;
mul.f32 f1604, f1530, f1597;
fma.rn.f32 f1605, f1531, f1595, f1604;
mul.f32 f1606, f2215, f1605;
mul.f32 f1608, f1603, f2215;
mul.f32 f1610, f1531, f1605;
mul.f32 f2188, f1530, f1603;
sub.f32 f1611, f2188, f1610;
mul.f32 f2187, f1506, f1605;
mul.f32 f1612, f1530, f1605;
fma.rn.f32 f1613, f1531, f1603, f1612;
mul.f32 f1614, f2214, f1613;
mul.f32 f1616, f1611, f2214;
mul.f32 f1618, f1531, f1613;
mul.f32 f2186, f1530, f1611;
sub.f32 f1619, f2186, f1618;
mul.f32 f2185, f1510, f1613;
mul.f32 f1620, f1530, f1613;
fma.rn.f32 f1621, f1531, f1611, f1620;
mul.f32 f1622, f2213, f1621;
mul.f32 f1624, f1619, f2213;
mul.f32 f2183, f1530, f1619;
mul.f32 f2184, f1531, f1621;
sub.f32 f1627, f2183, f2184;
mul.f32 f2182, f1514, f1621;
mul.f32 f1628, f1530, f1621;
fma.rn.f32 f1629, f1531, f1619, f1628;
mul.f32 f1630, f2212, f1629;
mul.f32 f1632, f1627, f2212;
mul.f32 f1634, f1531, f1629;
mul.f32 f2181, f1530, f1627;
sub.f32 f1635, f2181, f1634;
mul.f32 f2180, f1518, f1629;
mul.f32 f1636, f1530, f1629;
fma.rn.f32 f1637, f1531, f1627, f1636;
mul.f32 f1638, f2211, f1637;
mul.f32 f1640, f1635, f2211;
mul.f32 f1642, f1531, f1637;
mul.f32 f2179, f1530, f1635;
sub.f32 f1643, f2179, f1642;
mul.f32 f2178, f1522, f1637;
mul.f32 f1644, f1530, f1637;
fma.rn.f32 f1645, f1531, f1635, f1644;
mul.f32 f1646, f2210, f1645;
mul.f32 f1648, f1643, f2210;
mul.f32 f2176, f1530, f1643;
mul.f32 f2177, f1531, f1645;
sub.f32 f1651, f2176, f2177;
mul.f32 f2175, f1526, f1645;
mul.f32 f1652, f1530, f1645;
fma.rn.f32 f1653, f1531, f1643, f1652;
sub.f32 f2174, f2294, f2250;
mul.f32 f1654, f2174, f1653;
mul.f32 f1656, f1651, f2174;
mul.f32 f1658, f1531, f1653;
mul.f32 f2173, f1530, f1651;
sub.f32 f1659, f2173, f1658;
sub.f32 f2172, f1192, f1367;
mul.f32 f2171, f2172, f1653;
mul.f32 f1660, f1530, f1653;
fma.rn.f32 f1661, f1531, f1651, f1660;
mul.f32 f1662, f1473, f1661;
mul.f32 f1664, f1659, f1473;
mul.f32 f2169, f1530, f1659;
mul.f32 f2170, f1531, f1661;
sub.f32 f1667, f2169, f2170;
mul.f32 f2168, f1472, f1661;
mul.f32 f1668, f1530, f1661;
fma.rn.f32 f1669, f1531, f1659, f1668;
mul.f32 f1670, f1477, f1669;
mul.f32 f1672, f1667, f1477;
mul.f32 f1674, f1531, f1669;
mul.f32 f2167, f1530, f1667;
sub.f32 f1675, f2167, f1674;
mul.f32 f2166, f1476, f1669;
mul.f32 f1676, f1530, f1669;
fma.rn.f32 f1677, f1531, f1667, f1676;
mul.f32 f1678, f1481, f1677;
mul.f32 f1680, f1675, f1481;
mul.f32 f1682, f1531, f1677;
mul.f32 f2165, f1530, f1675;
sub.f32 f1683, f2165, f1682;
mul.f32 f2164, f1480, f1677;
mul.f32 f1684, f1530, f1677;
fma.rn.f32 f1685, f1531, f1675, f1684;
mul.f32 f1686, f1485, f1685;
mul.f32 f1688, f1683, f1485;
mul.f32 f2162, f1530, f1683;
mul.f32 f2163, f1531, f1685;
sub.f32 f1691, f2162, f2163;
mul.f32 f2161, f1484, f1685;
mul.f32 f1692, f1530, f1685;
fma.rn.f32 f1693, f1531, f1683, f1692;
mul.f32 f1694, f1489, f1693;
mul.f32 f1696, f1691, f1489;
mul.f32 f1698, f1531, f1693;
mul.f32 f2160, f1530, f1691;
sub.f32 f1699, f2160, f1698;
mul.f32 f2159, f1488, f1693;
mul.f32 f1700, f1530, f1693;
fma.rn.f32 f1701, f1531, f1691, f1700;
mul.f32 f1702, f1493, f1701;
mul.f32 f1704, f1699, f1493;
mul.f32 f1706, f1531, f1701;
mul.f32 f2158, f1530, f1699;
sub.f32 f1707, f2158, f1706;
mul.f32 f2157, f1492, f1701;
mul.f32 f1708, f1530, f1701;
fma.rn.f32 f1709, f1531, f1699, f1708;
mul.f32 f1710, f1497, f1709;
mul.f32 f1712, f1707, f1497;
mul.f32 f2155, f1530, f1707;
mul.f32 f2156, f1531, f1709;
sub.f32 f1715, f2155, f2156;
mul.f32 f2154, f1496, f1709;
mul.f32 f1716, f1530, f1709;
fma.rn.f32 f1717, f1531, f1707, f1716;
mul.f32 f1718, f1501, f1717;
mul.f32 f1720, f1715, f1501;
mul.f32 f1722, f1531, f1717;
mul.f32 f2153, f1530, f1715;
sub.f32 f1723, f2153, f1722;
mul.f32 f2152, f1500, f1717;
mul.f32 f1724, f1530, f1717;
fma.rn.f32 f1725, f1531, f1715, f1724;
mul.f32 f1726, f1505, f1725;
mul.f32 f1728, f1723, f1505;
mul.f32 f2150, f1530, f1723;
mul.f32 f2151, f1531, f1725;
sub.f32 f1731, f2150, f2151;
mul.f32 f2149, f1504, f1725;
mul.f32 f1732, f1530, f1725;
fma.rn.f32 f1733, f1531, f1723, f1732;
mul.f32 f1734, f1509, f1733;
mul.f32 f1736, f1731, f1509;
mul.f32 f1738, f1531, f1733;
mul.f32 f2148, f1530, f1731;
sub.f32 f1739, f2148, f1738;
mul.f32 f2147, f1508, f1733;
mul.f32 f1740, f1530, f1733;
fma.rn.f32 f1741, f1531, f1731, f1740;
mul.f32 f1742, f1513, f1741;
mul.f32 f1744, f1739, f1513;
mul.f32 f1746, f1531, f1741;
mul.f32 f2146, f1530, f1739;
sub.f32 f1747, f2146, f1746;
mul.f32 f2145, f1512, f1741;
mul.f32 f1748, f1530, f1741;
fma.rn.f32 f1749, f1531, f1739, f1748;
mul.f32 f1750, f1517, f1749;
mul.f32 f1752, f1747, f1517;
mul.f32 f2143, f1530, f1747;
mul.f32 f2144, f1531, f1749;
sub.f32 f1755, f2143, f2144;
mul.f32 f2142, f1516, f1749;
mul.f32 f1756, f1530, f1749;
fma.rn.f32 f1757, f1531, f1747, f1756;
mul.f32 f1758, f1521, f1757;
mul.f32 f1760, f1755, f1521;
mul.f32 f1762, f1531, f1757;
mul.f32 f2141, f1530, f1755;
sub.f32 f1763, f2141, f1762;
mul.f32 f2140, f1520, f1757;
mul.f32 f1764, f1530, f1757;
fma.rn.f32 f1765, f1531, f1755, f1764;
mul.f32 f1766, f1525, f1765;
mul.f32 f1768, f1763, f1525;
mul.f32 f1770, f1531, f1765;
mul.f32 f2139, f1530, f1763;
sub.f32 f1771, f2139, f1770;
mul.f32 f2138, f1524, f1765;
mul.f32 f1772, f1530, f1765;
mul.f32 f2137, f1470, f1531;
fma.rn.f32 f1773, f1531, f1763, f1772;
mul.f32 f1774, f1529, f1773;
mul.f32 f1775, f1528, f1773;
mul.f32 f1776, f1771, f1529;
and.b32 r16, r35, 248;
add.s32 r17, r9, r16;
sub.f32 f2543, f2294, f2250;
mul.f32 f2542, f1651, f2543;
mov.u32 r34, %tid.x;
shl.b32 r33, r34, 8;
barrier.sync 0;
and.b32 r18, r33, 24576;
add.s32 r19, r17, r18;
mov.u32 r26, %tid.x;
and.b32 r25, r26, 96;
sub.f32 f2546, f2294, f2250;
mul.f32 f2545, f1651, f2546;
add.f32 f1777, f2294, f2250;
mov.u32 r30, %tid.x;
and.b32 r29, r30, 96;
add.f32 f1778, f1192, f1367;
st.shared.v2.f32 [r19], {f1778, f1777};
mov.u32 r32, %tid.x;
and.b32 r31, r32, 96;
fma.rn.f32 f1779, f1530, f1470, f1534;
sub.f32 f1780, f1536, f2137;
st.shared.v2.f32 [r19+256], {f1779, f1780};
fma.rn.f32 f1781, f1539, f1474, f1542;
sub.f32 f1782, f1544, f2206;
st.shared.v2.f32 [r19+512], {f1781, f1782};
fma.rn.f32 f1783, f1547, f1478, f1550;
sub.f32 f1784, f1552, f2204;
st.shared.v2.f32 [r19+768], {f1783, f1784};
fma.rn.f32 f1785, f1555, f1482, f1558;
sub.f32 f1786, f1560, f2201;
st.shared.v2.f32 [r19+1024], {f1785, f1786};
fma.rn.f32 f1787, f1563, f1486, f1566;
sub.f32 f1788, f1568, f2199;
st.shared.v2.f32 [r19+1280], {f1787, f1788};
sub.f32 f1789, f1576, f2197;
fma.rn.f32 f1790, f1571, f1490, f1574;
st.shared.v2.f32 [r19+1536], {f1790, f1789};
fma.rn.f32 f1791, f1579, f1494, f1582;
sub.f32 f1792, f1584, f2194;
st.shared.v2.f32 [r19+1792], {f1791, f1792};
fma.rn.f32 f1793, f1587, f1498, f1590;
sub.f32 f1794, f1592, f2192;
st.shared.v2.f32 [r19+2048], {f1793, f1794};
fma.rn.f32 f1795, f1595, f1502, f1598;
sub.f32 f1796, f1600, f2189;
st.shared.v2.f32 [r19+2304], {f1795, f1796};
fma.rn.f32 f1797, f1603, f1506, f1606;
sub.f32 f1798, f1608, f2187;
st.shared.v2.f32 [r19+2560], {f1797, f1798};
fma.rn.f32 f1799, f1611, f1510, f1614;
sub.f32 f1800, f1616, f2185;
st.shared.v2.f32 [r19+2816], {f1799, f1800};
fma.rn.f32 f1801, f1619, f1514, f1622;
sub.f32 f1802, f1624, f2182;
st.shared.v2.f32 [r19+3072], {f1801, f1802};
fma.rn.f32 f1803, f1627, f1518, f1630;
sub.f32 f1804, f1632, f2180;
st.shared.v2.f32 [r19+3328], {f1803, f1804};
fma.rn.f32 f1805, f1635, f1522, f1638;
sub.f32 f1806, f1640, f2178;
st.shared.v2.f32 [r19+3584], {f1805, f1806};
fma.rn.f32 f1807, f1643, f1526, f1646;
sub.f32 f1808, f1648, f2175;
st.shared.v2.f32 [r19+3840], {f1807, f1808};
fma.rn.f32 f1809, f1651, f2172, f1654;
sub.f32 f1810, f2545, f2171;
st.shared.v2.f32 [r19+4096], {f1809, f1810};
fma.rn.f32 f1811, f1659, f1472, f1662;
sub.f32 f1812, f1664, f2168;
st.shared.v2.f32 [r19+4352], {f1811, f1812};
fma.rn.f32 f1813, f1667, f1476, f1670;
sub.f32 f1814, f1672, f2166;
st.shared.v2.f32 [r19+4608], {f1813, f1814};
fma.rn.f32 f1815, f1675, f1480, f1678;
sub.f32 f1816, f1680, f2164;
st.shared.v2.f32 [r19+4864], {f1815, f1816};
fma.rn.f32 f1817, f1683, f1484, f1686;
sub.f32 f1818, f1688, f2161;
st.shared.v2.f32 [r19+5120], {f1817, f1818};
fma.rn.f32 f1819, f1691, f1488, f1694;
sub.f32 f1820, f1696, f2159;
st.shared.v2.f32 [r19+5376], {f1819, f1820};
fma.rn.f32 f1821, f1699, f1492, f1702;
sub.f32 f1822, f1704, f2157;
st.shared.v2.f32 [r19+5632], {f1821, f1822};
fma.rn.f32 f1823, f1707, f1496, f1710;
sub.f32 f1824, f1712, f2154;
st.shared.v2.f32 [r19+5888], {f1823, f1824};
fma.rn.f32 f1825, f1715, f1500, f1718;
sub.f32 f1826, f1720, f2152;
st.shared.v2.f32 [r19+6144], {f1825, f1826};
fma.rn.f32 f1827, f1723, f1504, f1726;
sub.f32 f1828, f1728, f2149;
st.shared.v2.f32 [r19+6400], {f1827, f1828};
fma.rn.f32 f1829, f1731, f1508, f1734;
sub.f32 f1830, f1736, f2147;
st.shared.v2.f32 [r19+6656], {f1829, f1830};
fma.rn.f32 f1831, f1739, f1512, f1742;
sub.f32 f1832, f1744, f2145;
st.shared.v2.f32 [r19+6912], {f1831, f1832};
fma.rn.f32 f1833, f1747, f1516, f1750;
sub.f32 f1834, f1752, f2142;
st.shared.v2.f32 [r19+7168], {f1833, f1834};
fma.rn.f32 f1835, f1755, f1520, f1758;
sub.f32 f1836, f1760, f2140;
st.shared.v2.f32 [r19+7424], {f1835, f1836};
fma.rn.f32 f1837, f1763, f1524, f1766;
sub.f32 f1838, f1768, f2138;
st.shared.v2.f32 [r19+7680], {f1837, f1838};
fma.rn.f32 f1839, f1771, f1528, f1774;
sub.f32 f1840, f1776, f1775;
st.shared.v2.f32 [r19+7936], {f1839, f1840};
barrier.sync 0;
mad.lo.s32 r20, r31, -248, r19;
ld.shared.v2.f32 {f1841, f1842}, [r20];
ld.shared.v2.f32 {f1845, f1846}, [r20+1024];
ld.shared.v2.f32 {f1849, f1850}, [r20+2048];
ld.shared.v2.f32 {f1853, f1854}, [r20+3072];
ld.shared.v2.f32 {f1857, f1858}, [r20+4096];
ld.shared.v2.f32 {f1861, f1862}, [r20+5120];
ld.shared.v2.f32 {f1865, f1866}, [r20+6144];
ld.shared.v2.f32 {f1869, f1870}, [r20+7168];
ld.shared.v2.f32 {f1873, f1874}, [r20+8192];
ld.shared.v2.f32 {f1877, f1878}, [r20+9216];
ld.shared.v2.f32 {f1881, f1882}, [r20+10240];
ld.shared.v2.f32 {f1885, f1886}, [r20+11264];
ld.shared.v2.f32 {f1889, f1890}, [r20+12288];
ld.shared.v2.f32 {f1893, f1894}, [r20+13312];
ld.shared.v2.f32 {f1897, f1898}, [r20+14336];
ld.shared.v2.f32 {f1901, f1902}, [r20+15360];
ld.shared.v2.f32 {f1905, f1906}, [r20+16384];
ld.shared.v2.f32 {f1909, f1910}, [r20+17408];
ld.shared.v2.f32 {f1913, f1914}, [r20+18432];
ld.shared.v2.f32 {f1917, f1918}, [r20+19456];
ld.shared.v2.f32 {f1921, f1922}, [r20+20480];
ld.shared.v2.f32 {f1925, f1926}, [r20+21504];
ld.shared.v2.f32 {f1929, f1930}, [r20+22528];
ld.shared.v2.f32 {f1933, f1934}, [r20+23552];
ld.shared.v2.f32 {f1937, f1938}, [r20+24576];
ld.shared.v2.f32 {f1941, f1942}, [r20+25600];
ld.shared.v2.f32 {f1945, f1946}, [r20+26624];
ld.shared.v2.f32 {f1949, f1950}, [r20+27648];
ld.shared.v2.f32 {f1953, f1954}, [r20+28672];
ld.shared.v2.f32 {f1957, f1958}, [r20+29696];
ld.shared.v2.f32 {f1961, f1962}, [r20+30720];
ld.shared.v2.f32 {f1965, f1966}, [r20+31744];
add.f32 f1969, f1841, f1905;
sub.f32 f1971, f1841, f1905;
add.f32 f2136, f1842, f1906;
sub.f32 f1972, f1842, f1906;
add.f32 f1973, f1873, f1937;
sub.f32 f1975, f1873, f1937;
add.f32 f2135, f1874, f1938;
sub.f32 f1976, f1874, f1938;
add.f32 f1977, f1845, f1909;
sub.f32 f1979, f1845, f1909;
add.f32 f2134, f1846, f1910;
sub.f32 f1980, f1846, f1910;
add.f32 f1981, f1877, f1941;
sub.f32 f1983, f1877, f1941;
add.f32 f2133, f1878, f1942;
sub.f32 f1984, f1878, f1942;
add.f32 f1985, f1849, f1913;
sub.f32 f1987, f1849, f1913;
add.f32 f2132, f1850, f1914;
sub.f32 f1988, f1850, f1914;
add.f32 f1989, f1881, f1945;
sub.f32 f1991, f1881, f1945;
add.f32 f2131, f1882, f1946;
sub.f32 f1992, f1882, f1946;
add.f32 f1993, f1853, f1917;
sub.f32 f1995, f1853, f1917;
add.f32 f2130, f1854, f1918;
sub.f32 f1996, f1854, f1918;
add.f32 f1997, f1885, f1949;
sub.f32 f1999, f1885, f1949;
add.f32 f2129, f1886, f1950;
sub.f32 f2000, f1886, f1950;
add.f32 f2001, f1857, f1921;
sub.f32 f2003, f1857, f1921;
add.f32 f2128, f1858, f1922;
sub.f32 f2004, f1858, f1922;
add.f32 f2005, f1889, f1953;
sub.f32 f2007, f1889, f1953;
add.f32 f2127, f1890, f1954;
sub.f32 f2008, f1890, f1954;
add.f32 f2009, f1861, f1925;
sub.f32 f2011, f1861, f1925;
add.f32 f2126, f1862, f1926;
sub.f32 f2012, f1862, f1926;
add.f32 f2013, f1893, f1957;
sub.f32 f2015, f1893, f1957;
add.f32 f2125, f1894, f1958;
sub.f32 f2016, f1894, f1958;
add.f32 f2017, f1865, f1929;
sub.f32 f2019, f1865, f1929;
add.f32 f2124, f1866, f1930;
sub.f32 f2020, f1866, f1930;
add.f32 f2021, f1897, f1961;
sub.f32 f2023, f1897, f1961;
add.f32 f2123, f1898, f1962;
sub.f32 f2024, f1898, f1962;
add.f32 f2025, f1869, f1933;
sub.f32 f2027, f1869, f1933;
add.f32 f2122, f1870, f1934;
sub.f32 f2028, f1870, f1934;
add.f32 f2029, f1901, f1965;
sub.f32 f2031, f1901, f1965;
add.f32 f2121, f1902, f1966;
sub.f32 f2032, f1902, f1966;
add.f32 %0, f1969, f1973;
add.f32 %1, f2136, f2135;
add.f32 %3, f2134, f2133;
add.f32 %2, f1977, f1981;
add.f32 %5, f2132, f2131;
add.f32 %4, f1985, f1989;
add.f32 %7, f2130, f2129;
add.f32 %6, f1993, f1997;
add.f32 %8, f2001, f2005;
add.f32 %9, f2128, f2127;
add.f32 %10, f2009, f2013;
add.f32 %11, f2126, f2125;
add.f32 %12, f2017, f2021;
add.f32 %13, f2124, f2123;
add.f32 %15, f2122, f2121;
add.f32 %14, f2025, f2029;
add.f32 %17, f1972, f1975;
sub.f32 %16, f1971, f1976;
add.f32 %19, f1980, f1983;
sub.f32 %18, f1979, f1984;
sub.f32 %20, f1987, f1992;
add.f32 %21, f1988, f1991;
sub.f32 %22, f1995, f2000;
add.f32 %23, f1996, f1999;
sub.f32 %24, f2003, f2008;
add.f32 %25, f2004, f2007;
sub.f32 %26, f2011, f2016;
add.f32 %27, f2012, f2015;
add.f32 %29, f2020, f2023;
sub.f32 %28, f2019, f2024;
add.f32 %31, f2028, f2031;
sub.f32 %30, f2027, f2032;
sub.f32 %33, f2136, f2135;
sub.f32 %32, f1969, f1973;
sub.f32 %35, f2134, f2133;
sub.f32 %34, f1977, f1981;
sub.f32 %37, f2132, f2131;
sub.f32 %36, f1985, f1989;
sub.f32 %39, f2130, f2129;
sub.f32 %38, f1993, f1997;
sub.f32 %41, f2128, f2127;
sub.f32 %40, f2001, f2005;
sub.f32 %43, f2126, f2125;
sub.f32 %42, f2009, f2013;
sub.f32 %45, f2124, f2123;
sub.f32 %44, f2017, f2021;
sub.f32 %47, f2122, f2121;
sub.f32 %46, f2025, f2029;
sub.f32 %49, f1972, f1975;
add.f32 %48, f1971, f1976;
sub.f32 %51, f1980, f1983;
add.f32 %50, f1979, f1984;
sub.f32 %53, f1988, f1991;
add.f32 %52, f1987, f1992;
sub.f32 %55, f1996, f1999;
add.f32 %54, f1995, f2000;
sub.f32 %57, f2004, f2007;
add.f32 %56, f2003, f2008;
sub.f32 %59, f2012, f2015;
add.f32 %58, f2011, f2016;
sub.f32 %61, f2020, f2023;
add.f32 %60, f2019, f2024;
sub.f32 %63, f2028, f2031;
add.f32 %62, f2027, f2032;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_4096), "l"(lut_sp_32_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), 
"f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y));
};




// cufftdx_private_function<304, float, 1>
//
// Single inline-PTX block that transforms the eight complex<float> values in
// rmem[0..7] in place, exchanging data with other threads through shared
// memory. Judging by the include guard (FFT_4096_FP32_INV) and the 32 KiB
// (4096 x 8-byte) region each thread group addresses, this is presumably the
// 8-elements-per-thread variant of a 4096-point inverse FFT -- TODO confirm
// against the code that dispatches on function id 304.
//
// Parameters
//   rmem  per-thread array; elements 0..7 (.x and .y) are read as inputs and
//         overwritten with the results.
//   smem  32-bit shared-memory address used as the base of the exchange
//         buffer. The region a thread touches starts at
//         smem + tid.y * 32768 + (tid.x >> 9) * 32768 and every access stays
//         within the following 32768 bytes.
//
// Structure of the PTX body
//   1. An 8-point butterfly on the register inputs (the constants 0f3F3504F3
//      and 0fBF3504F3 are +/-0.70710677f).
//   2. Three "scale, exchange, butterfly" rounds. In each round:
//        - one complex factor w = (re, im) is fetched with ld.global from a
//          lookup table and w^2 .. w^7 are built by repeated multiplication;
//        - the first butterfly result is stored unchanged, the other seven
//          are stored as x * conj(w^k), i.e.
//          (w.re * x.re + w.im * x.im,  w.re * x.im - w.im * x.re);
//        - a `barrier.sync 0` precedes the stores and another one separates
//          the stores from the loads;
//        - eight values are read back 4096 bytes apart and fed to the next
//          8-point butterfly.
//
//      round | table          | table entry index | store stride
//      ------+----------------+-------------------+-------------
//        1   | lut_sp_8_4096  | tid.x & 511       |    8 bytes
//        2   | lut_sp_8_512   | (tid.x >> 3) & 63 |   64 bytes
//        3   | lut_sp_8_64    | (tid.x >> 6) & 7  |  512 bytes
//
//   3. The last butterfly writes directly into the output operands.
//
// The body executes `barrier.sync 0` six times, so it has to be reached by
// all threads that participate in barrier 0; do not call it from code where
// only part of the block is active.
//
// The lut_sp_8_* tables are not declared in this part of the file; they are
// passed as 64-bit addresses and read with ld.global, 8 bytes per entry.
template<> __forceinline__ __device__ void cufftdx_private_function<304, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<564>;
.reg .b32 r<27>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 15;
mov.u32 r3, %16;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f33, %20, %30;
add.f32 f34, %21, %32;
sub.f32 f35, %20, %30;
sub.f32 f36, %21, %32;
add.f32 f37, %25, %36;
add.f32 f38, %27, %37;
sub.f32 f39, %25, %36;
sub.f32 f40, %27, %37;
add.f32 f41, f33, f37;
add.f32 f42, f34, f38;
sub.f32 f43, f33, f37;
sub.f32 f44, f34, f38;
sub.f32 f45, f35, f40;
add.f32 f46, f36, f39;
add.f32 f47, f35, f40;
sub.f32 f48, f36, f39;
add.f32 f49, %22, %33;
add.f32 f50, %24, %35;
sub.f32 f51, %22, %33;
sub.f32 f52, %24, %35;
add.f32 f53, %28, %38;
add.f32 f54, %29, %39;
sub.f32 f55, %28, %38;
sub.f32 f56, %29, %39;
add.f32 f57, f49, f53;
add.f32 f58, f50, f54;
sub.f32 f59, f49, f53;
sub.f32 f60, f50, f54;
sub.f32 f61, f51, f56;
add.f32 f62, f52, f55;
add.f32 f63, f51, f56;
sub.f32 f64, f52, f55;
mul.f32 f65, f61, 0f3F3504F3;
mul.f32 f66, f62, 0f3F3504F3;
sub.f32 f67, f65, f66;
add.f32 f68, f65, f66;
mul.f32 f69, f63, 0fBF3504F3;
mul.f32 f70, f64, 0f3F3504F3;
sub.f32 f71, f69, f70;
mul.f32 f72, f64, 0fBF3504F3;
fma.rn.f32 f73, f63, 0f3F3504F3, f72;
sub.f32 f74, f41, f57;
sub.f32 f75, f42, f58;
add.f32 f76, f45, f67;
add.f32 f77, f46, f68;
sub.f32 f78, f45, f67;
sub.f32 f79, f46, f68;
sub.f32 f80, f43, f60;
add.f32 f81, f44, f59;
add.f32 f82, f43, f60;
sub.f32 f83, f44, f59;
add.f32 f84, f47, f71;
add.f32 f85, f48, f73;
sub.f32 f86, f47, f71;
sub.f32 f87, f48, f73;
and.b32 r6, r5, 511;
shl.b32 r7, r5, 6;
and.b32 r8, r7, -32768;
add.s32 r9, r4, r8;
shl.b32 r10, r5, 3;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 4088;
mov.u64 rd4, %17;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f88, f89}, [rd5];
mul.f32 f92, f77, f89;
mul.f32 f93, f76, f89;
mul.f32 f94, f88, f77;
mul.f32 f95, f88, f88;
mul.f32 f96, f89, f89;
sub.f32 f97, f95, f96;
mul.f32 f98, f89, f88;
fma.rn.f32 f99, f89, f88, f98;
mul.f32 f100, f81, f99;
mul.f32 f101, f80, f99;
mul.f32 f102, f97, f81;
mul.f32 f103, f88, f97;
mul.f32 f104, f89, f99;
sub.f32 f105, f103, f104;
mul.f32 f106, f88, f99;
fma.rn.f32 f107, f89, f97, f106;
mul.f32 f108, f85, f107;
mul.f32 f109, f84, f107;
mul.f32 f110, f105, f85;
mul.f32 f111, f88, f105;
mul.f32 f112, f89, f107;
sub.f32 f113, f111, f112;
mul.f32 f114, f88, f107;
fma.rn.f32 f115, f89, f105, f114;
mul.f32 f116, f75, f115;
mul.f32 f117, f74, f115;
mul.f32 f118, f113, f75;
mul.f32 f119, f88, f113;
mul.f32 f120, f89, f115;
sub.f32 f121, f119, f120;
mul.f32 f122, f88, f115;
fma.rn.f32 f123, f89, f113, f122;
mul.f32 f124, f79, f123;
mul.f32 f125, f78, f123;
mul.f32 f126, f121, f79;
mul.f32 f127, f88, f121;
mul.f32 f128, f89, f123;
sub.f32 f129, f127, f128;
mul.f32 f130, f88, f123;
fma.rn.f32 f131, f89, f121, f130;
mul.f32 f132, f83, f131;
mul.f32 f133, f82, f131;
mul.f32 f134, f129, f83;
mul.f32 f135, f88, f129;
mul.f32 f136, f89, f131;
sub.f32 f137, f135, f136;
mul.f32 f138, f88, f131;
fma.rn.f32 f139, f89, f129, f138;
mul.f32 f140, f87, f139;
mul.f32 f141, f86, f139;
mul.f32 f142, f137, f87;
barrier.sync 0;
and.b32 r11, r7, 32704;
add.s32 r12, r9, r11;
add.f32 f143, f42, f58;
add.f32 f144, f41, f57;
fma.rn.f32 f145, f88, f76, f92;
sub.f32 f146, f94, f93;
st.shared.v4.f32 [r12], {f144, f143, f145, f146};
fma.rn.f32 f147, f97, f80, f100;
sub.f32 f148, f102, f101;
sub.f32 f149, f110, f109;
fma.rn.f32 f150, f105, f84, f108;
st.shared.v4.f32 [r12+16], {f147, f148, f150, f149};
fma.rn.f32 f151, f113, f74, f116;
sub.f32 f152, f118, f117;
fma.rn.f32 f153, f121, f78, f124;
sub.f32 f154, f126, f125;
st.shared.v4.f32 [r12+32], {f151, f152, f153, f154};
fma.rn.f32 f155, f129, f82, f132;
sub.f32 f156, f134, f133;
fma.rn.f32 f157, f137, f86, f140;
sub.f32 f158, f142, f141;
st.shared.v4.f32 [r12+48], {f155, f156, f157, f158};
barrier.sync 0;
mad.lo.s32 r13, r6, -56, r12;
ld.shared.v2.f32 {f159, f160}, [r13];
ld.shared.v2.f32 {f163, f164}, [r13+4096];
ld.shared.v2.f32 {f167, f168}, [r13+8192];
ld.shared.v2.f32 {f171, f172}, [r13+12288];
ld.shared.v2.f32 {f175, f176}, [r13+16384];
ld.shared.v2.f32 {f179, f180}, [r13+20480];
ld.shared.v2.f32 {f183, f184}, [r13+24576];
ld.shared.v2.f32 {f187, f188}, [r13+28672];
add.f32 f191, f159, f175;
add.f32 f192, f160, f176;
sub.f32 f193, f159, f175;
sub.f32 f194, f160, f176;
add.f32 f195, f167, f183;
add.f32 f196, f168, f184;
sub.f32 f197, f167, f183;
sub.f32 f198, f168, f184;
add.f32 f199, f191, f195;
add.f32 f200, f192, f196;
sub.f32 f201, f191, f195;
sub.f32 f202, f192, f196;
sub.f32 f203, f193, f198;
add.f32 f204, f194, f197;
add.f32 f205, f193, f198;
sub.f32 f206, f194, f197;
add.f32 f207, f163, f179;
add.f32 f208, f164, f180;
sub.f32 f209, f163, f179;
sub.f32 f210, f164, f180;
add.f32 f211, f171, f187;
add.f32 f212, f172, f188;
sub.f32 f213, f171, f187;
sub.f32 f214, f172, f188;
add.f32 f215, f207, f211;
add.f32 f216, f208, f212;
sub.f32 f217, f207, f211;
sub.f32 f218, f208, f212;
sub.f32 f219, f209, f214;
add.f32 f220, f210, f213;
add.f32 f221, f209, f214;
sub.f32 f222, f210, f213;
mul.f32 f223, f219, 0f3F3504F3;
mul.f32 f224, f220, 0f3F3504F3;
sub.f32 f225, f223, f224;
add.f32 f226, f223, f224;
mul.f32 f227, f221, 0fBF3504F3;
mul.f32 f228, f222, 0f3F3504F3;
sub.f32 f229, f227, f228;
mul.f32 f230, f222, 0fBF3504F3;
fma.rn.f32 f231, f221, 0f3F3504F3, f230;
sub.f32 f232, f199, f215;
sub.f32 f233, f200, f216;
add.f32 f234, f203, f225;
add.f32 f235, f204, f226;
sub.f32 f236, f203, f225;
sub.f32 f237, f204, f226;
sub.f32 f238, f201, f218;
add.f32 f239, f202, f217;
add.f32 f240, f201, f218;
sub.f32 f241, f202, f217;
add.f32 f242, f205, f229;
add.f32 f243, f206, f231;
sub.f32 f244, f205, f229;
sub.f32 f245, f206, f231;
and.b32 r14, r5, 504;
cvt.u64.u32 rd6, r14;
mov.u64 rd7, %18;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f246, f247}, [rd8];
mul.f32 f250, f235, f247;
mul.f32 f251, f234, f247;
mul.f32 f252, f246, f235;
mul.f32 f253, f246, f246;
mul.f32 f254, f247, f247;
sub.f32 f255, f253, f254;
mul.f32 f256, f247, f246;
fma.rn.f32 f257, f247, f246, f256;
mul.f32 f258, f239, f257;
mul.f32 f259, f238, f257;
mul.f32 f260, f255, f239;
mul.f32 f261, f246, f255;
mul.f32 f262, f247, f257;
sub.f32 f263, f261, f262;
mul.f32 f264, f246, f257;
fma.rn.f32 f265, f247, f255, f264;
mul.f32 f266, f243, f265;
mul.f32 f267, f242, f265;
mul.f32 f268, f263, f243;
mul.f32 f269, f246, f263;
mul.f32 f270, f247, f265;
sub.f32 f271, f269, f270;
mul.f32 f272, f246, f265;
fma.rn.f32 f273, f247, f263, f272;
mul.f32 f274, f233, f273;
mul.f32 f275, f232, f273;
mul.f32 f276, f271, f233;
mul.f32 f277, f246, f271;
mul.f32 f278, f247, f273;
sub.f32 f279, f277, f278;
mul.f32 f280, f246, f273;
fma.rn.f32 f281, f247, f271, f280;
mul.f32 f282, f237, f281;
mul.f32 f283, f236, f281;
mul.f32 f284, f279, f237;
mul.f32 f285, f246, f279;
mul.f32 f286, f247, f281;
sub.f32 f287, f285, f286;
mul.f32 f288, f246, f281;
fma.rn.f32 f289, f247, f279, f288;
mul.f32 f290, f241, f289;
mul.f32 f291, f240, f289;
mul.f32 f292, f287, f241;
mul.f32 f293, f246, f287;
mul.f32 f294, f247, f289;
sub.f32 f295, f293, f294;
mul.f32 f296, f246, f289;
fma.rn.f32 f297, f247, f287, f296;
mul.f32 f298, f245, f297;
mul.f32 f299, f244, f297;
mul.f32 f300, f295, f245;
and.b32 r15, r10, 56;
add.s32 r16, r9, r15;
barrier.sync 0;
and.b32 r17, r7, 32256;
add.s32 r18, r16, r17;
add.f32 f301, f200, f216;
add.f32 f302, f199, f215;
st.shared.v2.f32 [r18], {f302, f301};
fma.rn.f32 f303, f246, f234, f250;
sub.f32 f304, f252, f251;
st.shared.v2.f32 [r18+64], {f303, f304};
fma.rn.f32 f305, f255, f238, f258;
sub.f32 f306, f260, f259;
st.shared.v2.f32 [r18+128], {f305, f306};
fma.rn.f32 f307, f263, f242, f266;
sub.f32 f308, f268, f267;
st.shared.v2.f32 [r18+192], {f307, f308};
sub.f32 f309, f276, f275;
fma.rn.f32 f310, f271, f232, f274;
st.shared.v2.f32 [r18+256], {f310, f309};
fma.rn.f32 f311, f279, f236, f282;
sub.f32 f312, f284, f283;
st.shared.v2.f32 [r18+320], {f311, f312};
fma.rn.f32 f313, f287, f240, f290;
sub.f32 f314, f292, f291;
st.shared.v2.f32 [r18+384], {f313, f314};
fma.rn.f32 f315, f295, f244, f298;
sub.f32 f316, f300, f299;
st.shared.v2.f32 [r18+448], {f315, f316};
barrier.sync 0;
mad.lo.s32 r19, r14, -56, r18;
ld.shared.v2.f32 {f317, f318}, [r19];
ld.shared.v2.f32 {f321, f322}, [r19+4096];
ld.shared.v2.f32 {f325, f326}, [r19+8192];
ld.shared.v2.f32 {f329, f330}, [r19+12288];
ld.shared.v2.f32 {f333, f334}, [r19+16384];
ld.shared.v2.f32 {f337, f338}, [r19+20480];
ld.shared.v2.f32 {f341, f342}, [r19+24576];
ld.shared.v2.f32 {f345, f346}, [r19+28672];
add.f32 f349, f317, f333;
add.f32 f350, f318, f334;
sub.f32 f351, f317, f333;
sub.f32 f352, f318, f334;
add.f32 f353, f325, f341;
add.f32 f354, f326, f342;
sub.f32 f355, f325, f341;
sub.f32 f356, f326, f342;
add.f32 f357, f349, f353;
add.f32 f358, f350, f354;
sub.f32 f359, f349, f353;
sub.f32 f360, f350, f354;
sub.f32 f361, f351, f356;
add.f32 f362, f352, f355;
add.f32 f363, f351, f356;
sub.f32 f364, f352, f355;
add.f32 f365, f321, f337;
add.f32 f366, f322, f338;
sub.f32 f367, f321, f337;
sub.f32 f368, f322, f338;
add.f32 f369, f329, f345;
add.f32 f370, f330, f346;
sub.f32 f371, f329, f345;
sub.f32 f372, f330, f346;
add.f32 f373, f365, f369;
add.f32 f374, f366, f370;
sub.f32 f375, f365, f369;
sub.f32 f376, f366, f370;
sub.f32 f377, f367, f372;
add.f32 f378, f368, f371;
add.f32 f379, f367, f372;
sub.f32 f380, f368, f371;
mul.f32 f381, f377, 0f3F3504F3;
mul.f32 f382, f378, 0f3F3504F3;
sub.f32 f383, f381, f382;
add.f32 f384, f381, f382;
mul.f32 f385, f379, 0fBF3504F3;
mul.f32 f386, f380, 0f3F3504F3;
sub.f32 f387, f385, f386;
mul.f32 f388, f380, 0fBF3504F3;
fma.rn.f32 f389, f379, 0f3F3504F3, f388;
sub.f32 f390, f357, f373;
sub.f32 f391, f358, f374;
add.f32 f392, f361, f383;
add.f32 f393, f362, f384;
sub.f32 f394, f361, f383;
sub.f32 f395, f362, f384;
sub.f32 f396, f359, f376;
add.f32 f397, f360, f375;
add.f32 f398, f359, f376;
sub.f32 f399, f360, f375;
add.f32 f400, f363, f387;
add.f32 f401, f364, f389;
sub.f32 f402, f363, f387;
sub.f32 f403, f364, f389;
and.b32 r20, r5, 448;
bfe.u32 r21, r5, 6, 3;
mul.wide.u32 rd9, r21, 8;
mov.u64 rd10, %19;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f404, f405}, [rd11];
mul.f32 f408, f393, f405;
mul.f32 f409, f392, f405;
mul.f32 f410, f404, f393;
mul.f32 f411, f404, f404;
mul.f32 f412, f405, f405;
sub.f32 f413, f411, f412;
mul.f32 f414, f405, f404;
fma.rn.f32 f415, f405, f404, f414;
mul.f32 f416, f397, f415;
mul.f32 f417, f396, f415;
mul.f32 f418, f413, f397;
mul.f32 f419, f404, f413;
mul.f32 f420, f405, f415;
sub.f32 f421, f419, f420;
mul.f32 f422, f404, f415;
fma.rn.f32 f423, f405, f413, f422;
mul.f32 f424, f401, f423;
mul.f32 f425, f400, f423;
mul.f32 f426, f421, f401;
mul.f32 f427, f404, f421;
mul.f32 f428, f405, f423;
sub.f32 f429, f427, f428;
mul.f32 f430, f404, f423;
fma.rn.f32 f431, f405, f421, f430;
mul.f32 f432, f391, f431;
mul.f32 f433, f390, f431;
mul.f32 f434, f429, f391;
mul.f32 f435, f404, f429;
mul.f32 f436, f405, f431;
sub.f32 f437, f435, f436;
mul.f32 f438, f404, f431;
fma.rn.f32 f439, f405, f429, f438;
mul.f32 f440, f395, f439;
mul.f32 f441, f394, f439;
mul.f32 f442, f437, f395;
mul.f32 f443, f404, f437;
mul.f32 f444, f405, f439;
sub.f32 f445, f443, f444;
mul.f32 f446, f404, f439;
fma.rn.f32 f447, f405, f437, f446;
mul.f32 f448, f399, f447;
mul.f32 f449, f398, f447;
mul.f32 f450, f445, f399;
mul.f32 f451, f404, f445;
mul.f32 f452, f405, f447;
sub.f32 f453, f451, f452;
mul.f32 f454, f404, f447;
fma.rn.f32 f455, f405, f445, f454;
mul.f32 f456, f403, f455;
mul.f32 f457, f402, f455;
mul.f32 f458, f453, f403;
and.b32 r22, r10, 504;
add.s32 r23, r9, r22;
barrier.sync 0;
and.b32 r24, r7, 28672;
add.s32 r25, r23, r24;
add.f32 f459, f358, f374;
add.f32 f460, f357, f373;
st.shared.v2.f32 [r25], {f460, f459};
fma.rn.f32 f461, f404, f392, f408;
sub.f32 f462, f410, f409;
st.shared.v2.f32 [r25+512], {f461, f462};
fma.rn.f32 f463, f413, f396, f416;
sub.f32 f464, f418, f417;
st.shared.v2.f32 [r25+1024], {f463, f464};
fma.rn.f32 f465, f421, f400, f424;
sub.f32 f466, f426, f425;
st.shared.v2.f32 [r25+1536], {f465, f466};
sub.f32 f467, f434, f433;
fma.rn.f32 f468, f429, f390, f432;
st.shared.v2.f32 [r25+2048], {f468, f467};
fma.rn.f32 f469, f437, f394, f440;
sub.f32 f470, f442, f441;
st.shared.v2.f32 [r25+2560], {f469, f470};
fma.rn.f32 f471, f445, f398, f448;
sub.f32 f472, f450, f449;
st.shared.v2.f32 [r25+3072], {f471, f472};
fma.rn.f32 f473, f453, f402, f456;
sub.f32 f474, f458, f457;
st.shared.v2.f32 [r25+3584], {f473, f474};
barrier.sync 0;
mad.lo.s32 r26, r20, -56, r25;
ld.shared.v2.f32 {f475, f476}, [r26];
ld.shared.v2.f32 {f479, f480}, [r26+4096];
ld.shared.v2.f32 {f483, f484}, [r26+8192];
ld.shared.v2.f32 {f487, f488}, [r26+12288];
ld.shared.v2.f32 {f491, f492}, [r26+16384];
ld.shared.v2.f32 {f495, f496}, [r26+20480];
ld.shared.v2.f32 {f499, f500}, [r26+24576];
ld.shared.v2.f32 {f503, f504}, [r26+28672];
add.f32 f507, f475, f491;
add.f32 f508, f476, f492;
sub.f32 f509, f475, f491;
sub.f32 f510, f476, f492;
add.f32 f511, f483, f499;
add.f32 f512, f484, f500;
sub.f32 f513, f483, f499;
sub.f32 f514, f484, f500;
add.f32 f515, f507, f511;
add.f32 f516, f508, f512;
sub.f32 f517, f507, f511;
sub.f32 f518, f508, f512;
sub.f32 f519, f509, f514;
add.f32 f520, f510, f513;
add.f32 f521, f509, f514;
sub.f32 f522, f510, f513;
add.f32 f523, f479, f495;
add.f32 f524, f480, f496;
sub.f32 f525, f479, f495;
sub.f32 f526, f480, f496;
add.f32 f527, f487, f503;
add.f32 f528, f488, f504;
sub.f32 f529, f487, f503;
sub.f32 f530, f488, f504;
add.f32 f531, f523, f527;
add.f32 f532, f524, f528;
sub.f32 f533, f523, f527;
sub.f32 f534, f524, f528;
sub.f32 f535, f525, f530;
add.f32 f536, f526, f529;
add.f32 f537, f525, f530;
sub.f32 f538, f526, f529;
mul.f32 f539, f535, 0f3F3504F3;
mul.f32 f540, f536, 0f3F3504F3;
sub.f32 f541, f539, f540;
add.f32 f542, f539, f540;
mul.f32 f543, f537, 0fBF3504F3;
mul.f32 f544, f538, 0f3F3504F3;
sub.f32 f545, f543, f544;
mul.f32 f546, f538, 0fBF3504F3;
fma.rn.f32 f547, f537, 0f3F3504F3, f546;
add.f32 %1, f516, f532;
add.f32 %0, f515, f531;
add.f32 %3, f520, f542;
add.f32 %2, f519, f541;
add.f32 %5, f518, f533;
sub.f32 %4, f517, f534;
add.f32 %7, f522, f547;
add.f32 %6, f521, f545;
sub.f32 %9, f516, f532;
sub.f32 %8, f515, f531;
sub.f32 %11, f520, f542;
sub.f32 %10, f519, f541;
sub.f32 %13, f518, f533;
add.f32 %12, f517, f534;
sub.f32 %15, f522, f547;
sub.f32 %14, f521, f545;
})"
// Operand map for the PTX above:
//   %0  .. %15  outputs: rmem[0].x, rmem[0].y, ..., rmem[7].x, rmem[7].y
//   %16         smem (32-bit shared-memory base address)
//   %17 .. %19  lut_sp_8_4096, lut_sp_8_512, lut_sp_8_64 (64-bit addresses)
//   %20 .. %39  inputs from rmem[0..7]. The .y of elements 1, 2, 4 and 5 is
//               listed twice; the PTX references only the second copy
//               (%24, %27, %32, %35), so %23, %26, %31 and %34 are unused.
//               Do not "deduplicate" the list without renumbering the PTX.
// All inputs are consumed by the first butterfly and the outputs are written
// only by the final instructions, so the plain "=f" constraints never
// overwrite an input that is still needed.
// NOTE(review): no clobber list is given even though the asm reads and writes
// shared memory; `volatile` keeps the statement itself in place, but confirm
// that callers do not rely on the compiler ordering their own shared-memory
// accesses around this call.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_4096), "l"(lut_sp_8_512), "l"(lut_sp_8_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y));
};




template<> __forceinline__ __device__ void cufftdx_private_function<308, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<2353>;
.reg .b32 r<36>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 14;
mov.u32 r3, %64;
add.s32 r4, r3, r2;
add.f32 f129, %67, %99;
sub.f32 f131, %67, %99;
add.f32 f2351, %68, %131;
sub.f32 f132, %68, %131;
add.f32 f133, %83, %115;
sub.f32 f135, %83, %115;
add.f32 f2349, %132, %116;
sub.f32 f136, %132, %116;
add.f32 f137, f129, f133;
sub.f32 f139, f129, f133;
add.f32 f2348, f2351, f2349;
sub.f32 f140, f2351, f2349;
sub.f32 f141, f131, f136;
add.f32 f143, f131, f136;
add.f32 f2347, f132, f135;
sub.f32 f144, f132, f135;
add.f32 f145, %75, %107;
sub.f32 f147, %75, %107;
add.f32 f2344, %133, %134;
sub.f32 f148, %133, %134;
add.f32 f149, %91, %123;
sub.f32 f151, %91, %123;
add.f32 f2342, %92, %135;
sub.f32 f152, %92, %135;
add.f32 f153, f145, f149;
sub.f32 f155, f145, f149;
add.f32 f2341, f2344, f2342;
sub.f32 f156, f2344, f2342;
sub.f32 f157, f147, f152;
add.f32 f159, f147, f152;
add.f32 f2340, f148, f151;
sub.f32 f160, f148, f151;
mul.f32 f161, f157, 0f3F3504F3;
mul.f32 f162, f2340, 0f3F3504F3;
sub.f32 f163, f161, f162;
add.f32 f164, f161, f162;
mul.f32 f2338, f159, 0fBF3504F3;
mul.f32 f2339, f160, 0f3F3504F3;
sub.f32 f167, f2338, f2339;
mul.f32 f168, f160, 0fBF3504F3;
fma.rn.f32 f169, f159, 0f3F3504F3, f168;
add.f32 f170, f137, f153;
sub.f32 f172, f137, f153;
add.f32 f2337, f2348, f2341;
sub.f32 f173, f2348, f2341;
add.f32 f174, f141, f163;
sub.f32 f176, f141, f163;
add.f32 f2336, f2347, f164;
sub.f32 f177, f2347, f164;
sub.f32 f178, f139, f156;
add.f32 f180, f139, f156;
add.f32 f2335, f140, f155;
sub.f32 f181, f140, f155;
add.f32 f182, f143, f167;
sub.f32 f184, f143, f167;
add.f32 f2334, f144, f169;
sub.f32 f185, f144, f169;
add.f32 f186, %71, %103;
sub.f32 f188, %71, %103;
add.f32 f2332, %136, %104;
sub.f32 f189, %136, %104;
add.f32 f190, %87, %119;
sub.f32 f192, %87, %119;
add.f32 f2329, %138, %137;
sub.f32 f193, %138, %137;
add.f32 f194, f186, f190;
sub.f32 f196, f186, f190;
add.f32 f2328, f2332, f2329;
sub.f32 f197, f2332, f2329;
sub.f32 f198, f188, f193;
add.f32 f200, f188, f193;
add.f32 f2327, f189, f192;
sub.f32 f201, f189, f192;
add.f32 f202, %79, %111;
sub.f32 f204, %79, %111;
add.f32 f2325, %80, %139;
sub.f32 f205, %80, %139;
add.f32 f206, %95, %127;
sub.f32 f208, %95, %127;
add.f32 f2323, %140, %128;
sub.f32 f209, %140, %128;
add.f32 f210, f202, f206;
sub.f32 f212, f202, f206;
add.f32 f2322, f2325, f2323;
sub.f32 f213, f2325, f2323;
sub.f32 f214, f204, f209;
add.f32 f216, f204, f209;
add.f32 f2321, f205, f208;
sub.f32 f217, f205, f208;
mul.f32 f218, f214, 0f3F3504F3;
mul.f32 f219, f2321, 0f3F3504F3;
sub.f32 f220, f218, f219;
add.f32 f221, f218, f219;
mul.f32 f2319, f216, 0fBF3504F3;
mul.f32 f2320, f217, 0f3F3504F3;
sub.f32 f224, f2319, f2320;
mul.f32 f225, f217, 0fBF3504F3;
fma.rn.f32 f226, f216, 0f3F3504F3, f225;
add.f32 f227, f194, f210;
sub.f32 f229, f194, f210;
add.f32 f2318, f2328, f2322;
sub.f32 f230, f2328, f2322;
add.f32 f231, f198, f220;
sub.f32 f233, f198, f220;
add.f32 f2317, f2327, f221;
sub.f32 f234, f2327, f221;
sub.f32 f235, f196, f213;
add.f32 f237, f196, f213;
add.f32 f2316, f197, f212;
sub.f32 f238, f197, f212;
add.f32 f239, f200, f224;
sub.f32 f241, f200, f224;
add.f32 f2315, f201, f226;
sub.f32 f242, f201, f226;
mul.f32 f2313, f231, 0f3F6C835E;
mul.f32 f2314, f2317, 0f3EC3EF15;
sub.f32 f245, f2313, f2314;
mul.f32 f246, f2317, 0f3F6C835E;
fma.rn.f32 f247, f231, 0f3EC3EF15, f246;
mul.f32 f248, f235, 0f3F3504F3;
mul.f32 f249, f2316, 0f3F3504F3;
sub.f32 f250, f248, f249;
add.f32 f251, f248, f249;
mul.f32 f253, f2315, 0f3F6C835E;
mul.f32 f2312, f239, 0f3EC3EF15;
sub.f32 f254, f2312, f253;
mul.f32 f255, f2315, 0f3EC3EF15;
fma.rn.f32 f256, f239, 0f3F6C835E, f255;
mul.f32 f258, f234, 0f3F6C835E;
mul.f32 f2311, f233, 0fBEC3EF15;
sub.f32 f259, f2311, f258;
mul.f32 f260, f234, 0fBEC3EF15;
fma.rn.f32 f261, f233, 0f3F6C835E, f260;
mul.f32 f2309, f237, 0fBF3504F3;
mul.f32 f2310, f238, 0f3F3504F3;
sub.f32 f264, f2309, f2310;
mul.f32 f265, f238, 0fBF3504F3;
fma.rn.f32 f266, f237, 0f3F3504F3, f265;
mul.f32 f2307, f241, 0fBF6C835E;
mul.f32 f2308, f242, 0f3EC3EF15;
sub.f32 f269, f2307, f2308;
mul.f32 f270, f242, 0fBF6C835E;
fma.rn.f32 f271, f241, 0f3EC3EF15, f270;
add.f32 f272, f170, f227;
sub.f32 f274, f170, f227;
add.f32 f2306, f2337, f2318;
sub.f32 f275, f2337, f2318;
add.f32 f276, f174, f245;
sub.f32 f278, f174, f245;
add.f32 f2305, f2336, f247;
sub.f32 f279, f2336, f247;
add.f32 f280, f178, f250;
sub.f32 f282, f178, f250;
add.f32 f2304, f2335, f251;
sub.f32 f283, f2335, f251;
add.f32 f284, f182, f254;
sub.f32 f286, f182, f254;
add.f32 f2303, f2334, f256;
sub.f32 f287, f2334, f256;
sub.f32 f288, f172, f230;
add.f32 f290, f172, f230;
add.f32 f2302, f173, f229;
sub.f32 f291, f173, f229;
add.f32 f292, f176, f259;
sub.f32 f294, f176, f259;
add.f32 f2301, f177, f261;
sub.f32 f295, f177, f261;
add.f32 f296, f180, f264;
sub.f32 f298, f180, f264;
add.f32 f2300, f181, f266;
sub.f32 f299, f181, f266;
add.f32 f300, f184, f269;
sub.f32 f302, f184, f269;
add.f32 f2299, f185, f271;
sub.f32 f303, f185, f271;
add.f32 f304, %69, %101;
sub.f32 f306, %69, %101;
add.f32 f2296, %142, %141;
sub.f32 f307, %142, %141;
add.f32 f308, %85, %117;
sub.f32 f310, %85, %117;
add.f32 f2294, %86, %143;
sub.f32 f311, %86, %143;
add.f32 f312, f304, f308;
sub.f32 f314, f304, f308;
add.f32 f2293, f2296, f2294;
sub.f32 f315, f2296, f2294;
sub.f32 f316, f306, f311;
add.f32 f318, f306, f311;
add.f32 f2292, f307, f310;
sub.f32 f319, f307, f310;
add.f32 f320, %77, %109;
sub.f32 f322, %77, %109;
add.f32 f2290, %144, %110;
sub.f32 f323, %144, %110;
add.f32 f324, %93, %125;
sub.f32 f326, %93, %125;
add.f32 f2287, %146, %145;
sub.f32 f327, %146, %145;
add.f32 f328, f320, f324;
sub.f32 f330, f320, f324;
add.f32 f2286, f2290, f2287;
sub.f32 f331, f2290, f2287;
sub.f32 f332, f322, f327;
add.f32 f334, f322, f327;
add.f32 f2285, f323, f326;
sub.f32 f335, f323, f326;
mul.f32 f336, f332, 0f3F3504F3;
mul.f32 f337, f2285, 0f3F3504F3;
sub.f32 f338, f336, f337;
add.f32 f339, f336, f337;
mul.f32 f341, f335, 0f3F3504F3;
mul.f32 f2284, f334, 0fBF3504F3;
sub.f32 f342, f2284, f341;
mul.f32 f343, f335, 0fBF3504F3;
fma.rn.f32 f344, f334, 0f3F3504F3, f343;
add.f32 f345, f312, f328;
sub.f32 f347, f312, f328;
add.f32 f2283, f2293, f2286;
sub.f32 f348, f2293, f2286;
add.f32 f349, f316, f338;
sub.f32 f351, f316, f338;
add.f32 f2282, f2292, f339;
sub.f32 f352, f2292, f339;
sub.f32 f353, f314, f331;
add.f32 f355, f314, f331;
add.f32 f2281, f315, f330;
sub.f32 f356, f315, f330;
add.f32 f357, f318, f342;
sub.f32 f359, f318, f342;
add.f32 f2280, f319, f344;
sub.f32 f360, f319, f344;
add.f32 f361, %73, %105;
sub.f32 f363, %73, %105;
add.f32 f2278, %74, %147;
sub.f32 f364, %74, %147;
add.f32 f365, %89, %121;
sub.f32 f367, %89, %121;
add.f32 f2276, %148, %122;
sub.f32 f368, %148, %122;
add.f32 f369, f361, f365;
sub.f32 f371, f361, f365;
add.f32 f2275, f2278, f2276;
sub.f32 f372, f2278, f2276;
sub.f32 f373, f363, f368;
add.f32 f375, f363, f368;
add.f32 f2274, f364, f367;
sub.f32 f376, f364, f367;
add.f32 f377, %81, %113;
sub.f32 f379, %81, %113;
add.f32 f2271, %149, %150;
sub.f32 f380, %149, %150;
add.f32 f381, %97, %129;
sub.f32 f383, %97, %129;
add.f32 f2270, %98, %130;
sub.f32 f384, %98, %130;
add.f32 f385, f377, f381;
sub.f32 f387, f377, f381;
add.f32 f2269, f2271, f2270;
sub.f32 f388, f2271, f2270;
sub.f32 f389, f379, f384;
add.f32 f391, f379, f384;
add.f32 f2268, f380, f383;
sub.f32 f392, f380, f383;
mul.f32 f393, f389, 0f3F3504F3;
mul.f32 f394, f2268, 0f3F3504F3;
sub.f32 f395, f393, f394;
add.f32 f396, f393, f394;
mul.f32 f2266, f391, 0fBF3504F3;
mul.f32 f2267, f392, 0f3F3504F3;
sub.f32 f399, f2266, f2267;
mul.f32 f400, f392, 0fBF3504F3;
fma.rn.f32 f401, f391, 0f3F3504F3, f400;
add.f32 f402, f369, f385;
sub.f32 f404, f369, f385;
add.f32 f2265, f2275, f2269;
sub.f32 f405, f2275, f2269;
add.f32 f406, f373, f395;
sub.f32 f408, f373, f395;
add.f32 f2264, f2274, f396;
sub.f32 f409, f2274, f396;
sub.f32 f410, f371, f388;
add.f32 f412, f371, f388;
add.f32 f2263, f372, f387;
sub.f32 f413, f372, f387;
add.f32 f414, f375, f399;
sub.f32 f416, f375, f399;
add.f32 f2262, f376, f401;
sub.f32 f417, f376, f401;
mul.f32 f419, f2264, 0f3EC3EF15;
mul.f32 f2261, f406, 0f3F6C835E;
sub.f32 f420, f2261, f419;
mul.f32 f421, f2264, 0f3F6C835E;
fma.rn.f32 f422, f406, 0f3EC3EF15, f421;
mul.f32 f423, f410, 0f3F3504F3;
mul.f32 f424, f2263, 0f3F3504F3;
sub.f32 f425, f423, f424;
add.f32 f426, f423, f424;
mul.f32 f428, f2262, 0f3F6C835E;
mul.f32 f2260, f414, 0f3EC3EF15;
sub.f32 f429, f2260, f428;
mul.f32 f430, f2262, 0f3EC3EF15;
fma.rn.f32 f431, f414, 0f3F6C835E, f430;
mul.f32 f433, f409, 0f3F6C835E;
mul.f32 f2259, f408, 0fBEC3EF15;
sub.f32 f434, f2259, f433;
mul.f32 f435, f409, 0fBEC3EF15;
fma.rn.f32 f436, f408, 0f3F6C835E, f435;
mul.f32 f438, f413, 0f3F3504F3;
mul.f32 f2258, f412, 0fBF3504F3;
sub.f32 f439, f2258, f438;
mul.f32 f440, f413, 0fBF3504F3;
fma.rn.f32 f441, f412, 0f3F3504F3, f440;
mul.f32 f443, f417, 0f3EC3EF15;
mul.f32 f2257, f416, 0fBF6C835E;
sub.f32 f444, f2257, f443;
mul.f32 f445, f417, 0fBF6C835E;
fma.rn.f32 f446, f416, 0f3EC3EF15, f445;
add.f32 f447, f345, f402;
sub.f32 f449, f345, f402;
add.f32 f2256, f2283, f2265;
sub.f32 f450, f2283, f2265;
add.f32 f451, f349, f420;
sub.f32 f453, f349, f420;
add.f32 f2255, f2282, f422;
sub.f32 f454, f2282, f422;
add.f32 f455, f353, f425;
sub.f32 f457, f353, f425;
add.f32 f2254, f2281, f426;
sub.f32 f458, f2281, f426;
add.f32 f459, f357, f429;
sub.f32 f461, f357, f429;
add.f32 f2253, f2280, f431;
sub.f32 f462, f2280, f431;
sub.f32 f463, f347, f405;
add.f32 f465, f347, f405;
add.f32 f2252, f348, f404;
sub.f32 f466, f348, f404;
add.f32 f467, f351, f434;
sub.f32 f469, f351, f434;
add.f32 f2251, f352, f436;
sub.f32 f470, f352, f436;
add.f32 f471, f355, f439;
sub.f32 f473, f355, f439;
add.f32 f2250, f356, f441;
sub.f32 f474, f356, f441;
add.f32 f475, f359, f444;
sub.f32 f477, f359, f444;
add.f32 f2249, f360, f446;
sub.f32 f478, f360, f446;
mul.f32 f480, f2255, 0f3E47C5C2;
mul.f32 f2248, f451, 0f3F7B14BE;
sub.f32 f481, f2248, f480;
mul.f32 f482, f2255, 0f3F7B14BE;
fma.rn.f32 f483, f451, 0f3E47C5C2, f482;
mul.f32 f485, f2254, 0f3EC3EF15;
mul.f32 f2247, f455, 0f3F6C835E;
sub.f32 f486, f2247, f485;
mul.f32 f487, f2254, 0f3F6C835E;
fma.rn.f32 f488, f455, 0f3EC3EF15, f487;
mul.f32 f490, f2253, 0f3F0E39DA;
mul.f32 f2246, f459, 0f3F54DB31;
sub.f32 f491, f2246, f490;
mul.f32 f492, f2253, 0f3F54DB31;
fma.rn.f32 f493, f459, 0f3F0E39DA, f492;
mul.f32 f494, f463, 0f3F3504F3;
mul.f32 f495, f2252, 0f3F3504F3;
sub.f32 f496, f494, f495;
add.f32 f497, f494, f495;
mul.f32 f499, f2251, 0f3F54DB31;
mul.f32 f2245, f467, 0f3F0E39DA;
sub.f32 f500, f2245, f499;
mul.f32 f501, f2251, 0f3F0E39DA;
fma.rn.f32 f502, f467, 0f3F54DB31, f501;
mul.f32 f504, f2250, 0f3F6C835E;
mul.f32 f2244, f471, 0f3EC3EF15;
sub.f32 f505, f2244, f504;
mul.f32 f506, f2250, 0f3EC3EF15;
fma.rn.f32 f507, f471, 0f3F6C835E, f506;
mul.f32 f509, f2249, 0f3F7B14BE;
mul.f32 f2243, f475, 0f3E47C5C2;
sub.f32 f510, f2243, f509;
mul.f32 f511, f2249, 0f3E47C5C2;
fma.rn.f32 f512, f475, 0f3F7B14BE, f511;
mul.f32 f514, f454, 0f3F7B14BE;
mul.f32 f2242, f453, 0fBE47C5C2;
sub.f32 f515, f2242, f514;
mul.f32 f516, f454, 0fBE47C5C2;
fma.rn.f32 f517, f453, 0f3F7B14BE, f516;
mul.f32 f2240, f457, 0fBEC3EF15;
mul.f32 f2241, f458, 0f3F6C835E;
sub.f32 f520, f2240, f2241;
mul.f32 f521, f458, 0fBEC3EF15;
fma.rn.f32 f522, f457, 0f3F6C835E, f521;
mul.f32 f2238, f461, 0fBF0E39DA;
mul.f32 f2239, f462, 0f3F54DB31;
sub.f32 f525, f2238, f2239;
mul.f32 f526, f462, 0fBF0E39DA;
fma.rn.f32 f527, f461, 0f3F54DB31, f526;
mul.f32 f2236, f465, 0fBF3504F3;
mul.f32 f2237, f466, 0f3F3504F3;
sub.f32 f530, f2236, f2237;
mul.f32 f531, f466, 0fBF3504F3;
fma.rn.f32 f532, f465, 0f3F3504F3, f531;
mul.f32 f2234, f469, 0fBF54DB31;
mul.f32 f2235, f470, 0f3F0E39DA;
sub.f32 f535, f2234, f2235;
mul.f32 f536, f470, 0fBF54DB31;
fma.rn.f32 f537, f469, 0f3F0E39DA, f536;
mul.f32 f539, f474, 0f3EC3EF15;
mul.f32 f2233, f473, 0fBF6C835E;
sub.f32 f540, f2233, f539;
mul.f32 f541, f474, 0fBF6C835E;
fma.rn.f32 f542, f473, 0f3EC3EF15, f541;
mul.f32 f544, f478, 0f3E47C5C2;
mul.f32 f2232, f477, 0fBF7B14BE;
sub.f32 f545, f2232, f544;
mul.f32 f546, f478, 0fBF7B14BE;
fma.rn.f32 f547, f477, 0f3E47C5C2, f546;
add.f32 f548, f272, f447;
sub.f32 f550, f272, f447;
add.f32 f2231, f2306, f2256;
sub.f32 f551, f2306, f2256;
add.f32 f552, f276, f481;
sub.f32 f554, f276, f481;
add.f32 f2230, f2305, f483;
sub.f32 f555, f2305, f483;
add.f32 f556, f280, f486;
sub.f32 f558, f280, f486;
add.f32 f2229, f2304, f488;
sub.f32 f559, f2304, f488;
add.f32 f560, f284, f491;
sub.f32 f562, f284, f491;
add.f32 f2228, f2303, f493;
sub.f32 f563, f2303, f493;
add.f32 f564, f288, f496;
sub.f32 f566, f288, f496;
add.f32 f2227, f2302, f497;
sub.f32 f567, f2302, f497;
add.f32 f568, f292, f500;
sub.f32 f570, f292, f500;
add.f32 f2226, f2301, f502;
sub.f32 f571, f2301, f502;
add.f32 f572, f296, f505;
sub.f32 f574, f296, f505;
add.f32 f2225, f2300, f507;
sub.f32 f575, f2300, f507;
add.f32 f576, f300, f510;
sub.f32 f578, f300, f510;
add.f32 f2224, f2299, f512;
sub.f32 f579, f2299, f512;
sub.f32 f580, f274, f450;
add.f32 f582, f274, f450;
add.f32 f2223, f275, f449;
sub.f32 f583, f275, f449;
add.f32 f584, f278, f515;
sub.f32 f586, f278, f515;
add.f32 f2222, f279, f517;
sub.f32 f587, f279, f517;
add.f32 f588, f282, f520;
sub.f32 f590, f282, f520;
add.f32 f2221, f283, f522;
sub.f32 f591, f283, f522;
add.f32 f592, f286, f525;
sub.f32 f594, f286, f525;
add.f32 f2220, f287, f527;
sub.f32 f595, f287, f527;
add.f32 f596, f290, f530;
sub.f32 f598, f290, f530;
add.f32 f2219, f291, f532;
sub.f32 f599, f291, f532;
add.f32 f600, f294, f535;
sub.f32 f602, f294, f535;
add.f32 f2218, f295, f537;
sub.f32 f603, f295, f537;
add.f32 f604, f298, f540;
sub.f32 f606, f298, f540;
add.f32 f2217, f299, f542;
sub.f32 f607, f299, f542;
add.f32 f608, f302, f545;
sub.f32 f610, f302, f545;
add.f32 f2216, f303, f547;
sub.f32 f611, f303, f547;
mov.u32 r24, %tid.x;
shl.b32 r7, r24, 3;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 1016;
mov.u64 rd4, %65;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f612, f613}, [rd5];
mul.f32 f616, f2230, f613;
fma.rn.f32 f617, f612, f552, f616;
mul.f32 f618, f552, f613;
mul.f32 f619, f612, f2230;
sub.f32 f620, f619, f618;
mul.f32 f2214, f612, f612;
mul.f32 f2215, f613, f613;
sub.f32 f623, f2214, f2215;
mul.f32 f624, f613, f612;
fma.rn.f32 f625, f613, f612, f624;
mul.f32 f626, f2229, f625;
fma.rn.f32 f627, f623, f556, f626;
mul.f32 f628, f556, f625;
mul.f32 f629, f623, f2229;
sub.f32 f630, f629, f628;
mul.f32 f2212, f612, f623;
mul.f32 f2213, f613, f625;
sub.f32 f633, f2212, f2213;
mul.f32 f634, f612, f625;
fma.rn.f32 f635, f613, f623, f634;
mul.f32 f636, f2228, f635;
fma.rn.f32 f637, f633, f560, f636;
mul.f32 f638, f560, f635;
mul.f32 f639, f633, f2228;
sub.f32 f640, f639, f638;
mul.f32 f642, f613, f635;
mul.f32 f2211, f612, f633;
sub.f32 f643, f2211, f642;
mul.f32 f644, f612, f635;
fma.rn.f32 f645, f613, f633, f644;
mul.f32 f646, f2227, f645;
fma.rn.f32 f647, f643, f564, f646;
mul.f32 f648, f564, f645;
mul.f32 f649, f643, f2227;
sub.f32 f650, f649, f648;
mul.f32 f652, f613, f645;
mul.f32 f2210, f612, f643;
sub.f32 f653, f2210, f652;
mul.f32 f654, f612, f645;
fma.rn.f32 f655, f613, f643, f654;
mul.f32 f656, f2226, f655;
fma.rn.f32 f657, f653, f568, f656;
mul.f32 f658, f568, f655;
mul.f32 f659, f653, f2226;
sub.f32 f660, f659, f658;
mul.f32 f662, f613, f655;
mul.f32 f2209, f612, f653;
sub.f32 f663, f2209, f662;
mul.f32 f664, f612, f655;
fma.rn.f32 f665, f613, f653, f664;
mul.f32 f666, f2225, f665;
fma.rn.f32 f667, f663, f572, f666;
mul.f32 f668, f572, f665;
mul.f32 f669, f663, f2225;
sub.f32 f670, f669, f668;
mul.f32 f2207, f612, f663;
mul.f32 f2208, f613, f665;
sub.f32 f673, f2207, f2208;
mul.f32 f674, f612, f665;
fma.rn.f32 f675, f613, f663, f674;
mul.f32 f676, f2224, f675;
fma.rn.f32 f677, f673, f576, f676;
mul.f32 f678, f576, f675;
mul.f32 f679, f673, f2224;
sub.f32 f680, f679, f678;
mul.f32 f2205, f612, f673;
mul.f32 f2206, f613, f675;
sub.f32 f683, f2205, f2206;
mul.f32 f684, f612, f675;
fma.rn.f32 f685, f613, f673, f684;
mul.f32 f686, f2223, f685;
fma.rn.f32 f687, f683, f580, f686;
mul.f32 f688, f580, f685;
mul.f32 f689, f683, f2223;
sub.f32 f690, f689, f688;
mul.f32 f692, f613, f685;
mul.f32 f2204, f612, f683;
sub.f32 f693, f2204, f692;
mul.f32 f694, f612, f685;
fma.rn.f32 f695, f613, f683, f694;
mul.f32 f696, f2222, f695;
fma.rn.f32 f697, f693, f584, f696;
mul.f32 f698, f584, f695;
mul.f32 f699, f693, f2222;
sub.f32 f700, f699, f698;
mul.f32 f702, f613, f695;
mul.f32 f2203, f612, f693;
sub.f32 f703, f2203, f702;
mul.f32 f704, f612, f695;
fma.rn.f32 f705, f613, f693, f704;
mul.f32 f706, f2221, f705;
fma.rn.f32 f707, f703, f588, f706;
mul.f32 f708, f588, f705;
mul.f32 f709, f703, f2221;
sub.f32 f710, f709, f708;
mul.f32 f2201, f612, f703;
mul.f32 f2202, f613, f705;
sub.f32 f713, f2201, f2202;
mul.f32 f714, f612, f705;
fma.rn.f32 f715, f613, f703, f714;
mul.f32 f716, f2220, f715;
fma.rn.f32 f717, f713, f592, f716;
mul.f32 f718, f592, f715;
mul.f32 f719, f713, f2220;
sub.f32 f720, f719, f718;
mul.f32 f2199, f612, f713;
mul.f32 f2200, f613, f715;
sub.f32 f723, f2199, f2200;
mul.f32 f724, f612, f715;
fma.rn.f32 f725, f613, f713, f724;
mul.f32 f726, f2219, f725;
fma.rn.f32 f727, f723, f596, f726;
mul.f32 f728, f596, f725;
mul.f32 f729, f723, f2219;
sub.f32 f730, f729, f728;
mul.f32 f732, f613, f725;
mul.f32 f2198, f612, f723;
sub.f32 f733, f2198, f732;
mul.f32 f734, f612, f725;
fma.rn.f32 f735, f613, f723, f734;
mul.f32 f736, f2218, f735;
fma.rn.f32 f737, f733, f600, f736;
mul.f32 f738, f600, f735;
mul.f32 f739, f733, f2218;
sub.f32 f740, f739, f738;
mul.f32 f742, f613, f735;
mul.f32 f2197, f612, f733;
sub.f32 f743, f2197, f742;
mul.f32 f744, f612, f735;
fma.rn.f32 f745, f613, f733, f744;
mul.f32 f746, f2217, f745;
fma.rn.f32 f747, f743, f604, f746;
mul.f32 f748, f604, f745;
mul.f32 f749, f743, f2217;
sub.f32 f750, f749, f748;
mul.f32 f752, f613, f745;
mul.f32 f2196, f612, f743;
sub.f32 f753, f2196, f752;
mul.f32 f754, f612, f745;
fma.rn.f32 f755, f613, f743, f754;
mul.f32 f756, f2216, f755;
fma.rn.f32 f757, f753, f608, f756;
mul.f32 f758, f608, f755;
mul.f32 f759, f753, f2216;
sub.f32 f760, f759, f758;
mul.f32 f2194, f612, f753;
mul.f32 f2195, f613, f755;
sub.f32 f763, f2194, f2195;
mul.f32 f764, f612, f755;
fma.rn.f32 f765, f613, f753, f764;
mul.f32 f766, f551, f765;
fma.rn.f32 f767, f763, f550, f766;
mul.f32 f768, f550, f765;
mul.f32 f769, f763, f551;
sub.f32 f770, f769, f768;
mul.f32 f2192, f612, f763;
mul.f32 f2193, f613, f765;
sub.f32 f773, f2192, f2193;
mul.f32 f774, f612, f765;
fma.rn.f32 f775, f613, f763, f774;
mul.f32 f776, f555, f775;
fma.rn.f32 f777, f773, f554, f776;
mul.f32 f778, f554, f775;
mul.f32 f779, f773, f555;
sub.f32 f780, f779, f778;
mul.f32 f782, f613, f775;
mul.f32 f2191, f612, f773;
sub.f32 f783, f2191, f782;
mul.f32 f784, f612, f775;
fma.rn.f32 f785, f613, f773, f784;
mul.f32 f786, f559, f785;
fma.rn.f32 f787, f783, f558, f786;
mul.f32 f788, f558, f785;
mul.f32 f789, f783, f559;
sub.f32 f790, f789, f788;
mul.f32 f792, f613, f785;
mul.f32 f2190, f612, f783;
sub.f32 f793, f2190, f792;
mul.f32 f794, f612, f785;
fma.rn.f32 f795, f613, f783, f794;
mul.f32 f796, f563, f795;
fma.rn.f32 f797, f793, f562, f796;
mul.f32 f798, f562, f795;
mul.f32 f799, f793, f563;
sub.f32 f800, f799, f798;
mul.f32 f802, f613, f795;
mul.f32 f2189, f612, f793;
sub.f32 f803, f2189, f802;
mul.f32 f804, f612, f795;
fma.rn.f32 f805, f613, f793, f804;
mul.f32 f806, f567, f805;
fma.rn.f32 f807, f803, f566, f806;
mul.f32 f808, f566, f805;
mul.f32 f809, f803, f567;
sub.f32 f810, f809, f808;
mul.f32 f2187, f612, f803;
mul.f32 f2188, f613, f805;
sub.f32 f813, f2187, f2188;
mul.f32 f814, f612, f805;
fma.rn.f32 f815, f613, f803, f814;
mul.f32 f816, f571, f815;
fma.rn.f32 f817, f813, f570, f816;
mul.f32 f818, f570, f815;
mul.f32 f819, f813, f571;
sub.f32 f820, f819, f818;
mul.f32 f2185, f612, f813;
mul.f32 f2186, f613, f815;
sub.f32 f823, f2185, f2186;
mul.f32 f824, f612, f815;
fma.rn.f32 f825, f613, f813, f824;
mul.f32 f826, f575, f825;
fma.rn.f32 f827, f823, f574, f826;
mul.f32 f828, f574, f825;
mul.f32 f829, f823, f575;
sub.f32 f830, f829, f828;
mul.f32 f832, f613, f825;
mul.f32 f2184, f612, f823;
sub.f32 f833, f2184, f832;
mul.f32 f834, f612, f825;
fma.rn.f32 f835, f613, f823, f834;
mul.f32 f836, f579, f835;
fma.rn.f32 f837, f833, f578, f836;
mul.f32 f838, f578, f835;
mul.f32 f839, f833, f579;
sub.f32 f840, f839, f838;
mul.f32 f842, f613, f835;
mul.f32 f2183, f612, f833;
sub.f32 f843, f2183, f842;
mul.f32 f844, f612, f835;
fma.rn.f32 f845, f613, f833, f844;
mul.f32 f846, f583, f845;
fma.rn.f32 f847, f843, f582, f846;
mul.f32 f848, f582, f845;
mul.f32 f849, f843, f583;
sub.f32 f850, f849, f848;
mul.f32 f2181, f612, f843;
mul.f32 f2182, f613, f845;
sub.f32 f853, f2181, f2182;
mul.f32 f854, f612, f845;
fma.rn.f32 f855, f613, f843, f854;
mul.f32 f856, f587, f855;
fma.rn.f32 f857, f853, f586, f856;
mul.f32 f858, f586, f855;
mul.f32 f859, f853, f587;
sub.f32 f860, f859, f858;
mul.f32 f2179, f612, f853;
mul.f32 f2180, f613, f855;
sub.f32 f863, f2179, f2180;
mul.f32 f864, f612, f855;
fma.rn.f32 f865, f613, f853, f864;
mul.f32 f866, f591, f865;
fma.rn.f32 f867, f863, f590, f866;
mul.f32 f868, f590, f865;
mul.f32 f869, f863, f591;
sub.f32 f870, f869, f868;
mul.f32 f872, f613, f865;
mul.f32 f2178, f612, f863;
sub.f32 f873, f2178, f872;
mul.f32 f874, f612, f865;
fma.rn.f32 f875, f613, f863, f874;
mul.f32 f876, f595, f875;
fma.rn.f32 f877, f873, f594, f876;
mul.f32 f878, f594, f875;
mul.f32 f879, f873, f595;
sub.f32 f880, f879, f878;
mul.f32 f882, f613, f875;
mul.f32 f2177, f612, f873;
sub.f32 f883, f2177, f882;
mul.f32 f884, f612, f875;
fma.rn.f32 f885, f613, f873, f884;
mul.f32 f886, f599, f885;
fma.rn.f32 f887, f883, f598, f886;
mul.f32 f888, f598, f885;
mul.f32 f889, f883, f599;
sub.f32 f890, f889, f888;
mul.f32 f892, f613, f885;
mul.f32 f2176, f612, f883;
sub.f32 f893, f2176, f892;
mul.f32 f894, f612, f885;
fma.rn.f32 f895, f613, f883, f894;
mul.f32 f896, f603, f895;
fma.rn.f32 f897, f893, f602, f896;
mul.f32 f898, f602, f895;
mul.f32 f899, f893, f603;
sub.f32 f900, f899, f898;
mul.f32 f2174, f612, f893;
mul.f32 f2175, f613, f895;
sub.f32 f903, f2174, f2175;
mul.f32 f904, f612, f895;
fma.rn.f32 f905, f613, f893, f904;
mul.f32 f906, f607, f905;
fma.rn.f32 f907, f903, f606, f906;
mul.f32 f908, f606, f905;
mul.f32 f909, f903, f607;
sub.f32 f910, f909, f908;
mul.f32 f2172, f612, f903;
mul.f32 f2173, f613, f905;
sub.f32 f913, f2172, f2173;
mov.u32 r32, %tid.x;
mul.f32 f914, f612, f905;
fma.rn.f32 f915, f613, f903, f914;
mul.f32 f916, f611, f915;
fma.rn.f32 f917, f913, f610, f916;
mul.f32 f918, f610, f915;
mul.f32 f919, f913, f611;
sub.f32 f920, f919, f918;
shl.b32 r8, r32, 7;
and.b32 r9, r8, -16384;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 16256;
add.s32 r12, r10, r11;
st.shared.v4.f32 [r12], {f548, f617, f627, f637};
st.shared.v4.f32 [r12+16], {f647, f657, f667, f677};
st.shared.v4.f32 [r12+32], {f687, f697, f707, f717};
st.shared.v4.f32 [r12+48], {f727, f737, f747, f757};
st.shared.v4.f32 [r12+64], {f767, f777, f787, f797};
st.shared.v4.f32 [r12+80], {f807, f817, f827, f837};
st.shared.v4.f32 [r12+96], {f847, f857, f867, f877};
st.shared.v4.f32 [r12+112], {f887, f897, f907, f917};
barrier.sync 0;
and.b32 r23, r32, 127;
mad.lo.s32 r13, r23, -124, r12;
ld.shared.f32 f921, [r13];
ld.shared.f32 f922, [r13+512];
ld.shared.f32 f923, [r13+1024];
ld.shared.f32 f924, [r13+1536];
ld.shared.f32 f925, [r13+2048];
ld.shared.f32 f926, [r13+2560];
ld.shared.f32 f927, [r13+3072];
ld.shared.f32 f928, [r13+3584];
ld.shared.f32 f929, [r13+4096];
ld.shared.f32 f930, [r13+4608];
ld.shared.f32 f931, [r13+5120];
ld.shared.f32 f932, [r13+5632];
ld.shared.f32 f933, [r13+6144];
ld.shared.f32 f934, [r13+6656];
ld.shared.f32 f935, [r13+7168];
ld.shared.f32 f936, [r13+7680];
ld.shared.f32 f937, [r13+8192];
ld.shared.f32 f938, [r13+8704];
ld.shared.f32 f939, [r13+9216];
ld.shared.f32 f940, [r13+9728];
ld.shared.f32 f941, [r13+10240];
ld.shared.f32 f942, [r13+10752];
ld.shared.f32 f943, [r13+11264];
ld.shared.f32 f944, [r13+11776];
ld.shared.f32 f945, [r13+12288];
ld.shared.f32 f946, [r13+12800];
ld.shared.f32 f947, [r13+13312];
ld.shared.f32 f948, [r13+13824];
ld.shared.f32 f949, [r13+14336];
ld.shared.f32 f950, [r13+14848];
ld.shared.f32 f951, [r13+15360];
ld.shared.f32 f952, [r13+15872];
barrier.sync 0;
st.shared.v4.f32 [r12], {f2231, f620, f630, f640};
st.shared.v4.f32 [r12+16], {f650, f660, f670, f680};
st.shared.v4.f32 [r12+32], {f690, f700, f710, f720};
st.shared.v4.f32 [r12+48], {f730, f740, f750, f760};
st.shared.v4.f32 [r12+64], {f770, f780, f790, f800};
st.shared.v4.f32 [r12+80], {f810, f820, f830, f840};
st.shared.v4.f32 [r12+96], {f850, f860, f870, f880};
st.shared.v4.f32 [r12+112], {f890, f900, f910, f920};
barrier.sync 0;
ld.shared.f32 f953, [r13];
ld.shared.f32 f954, [r13+512];
ld.shared.f32 f955, [r13+1024];
ld.shared.f32 f956, [r13+1536];
ld.shared.f32 f957, [r13+2048];
ld.shared.f32 f958, [r13+2560];
ld.shared.f32 f959, [r13+3072];
ld.shared.f32 f960, [r13+3584];
ld.shared.f32 f961, [r13+4096];
ld.shared.f32 f962, [r13+4608];
ld.shared.f32 f963, [r13+5120];
ld.shared.f32 f964, [r13+5632];
ld.shared.f32 f965, [r13+6144];
ld.shared.f32 f966, [r13+6656];
ld.shared.f32 f967, [r13+7168];
ld.shared.f32 f968, [r13+7680];
ld.shared.f32 f969, [r13+8192];
ld.shared.f32 f970, [r13+8704];
ld.shared.f32 f971, [r13+9216];
ld.shared.f32 f972, [r13+9728];
ld.shared.f32 f973, [r13+10240];
ld.shared.f32 f974, [r13+10752];
ld.shared.f32 f975, [r13+11264];
ld.shared.f32 f976, [r13+11776];
ld.shared.f32 f977, [r13+12288];
ld.shared.f32 f978, [r13+12800];
ld.shared.f32 f979, [r13+13312];
ld.shared.f32 f980, [r13+13824];
ld.shared.f32 f981, [r13+14336];
ld.shared.f32 f982, [r13+14848];
ld.shared.f32 f983, [r13+15360];
ld.shared.f32 f984, [r13+15872];
add.f32 f985, f921, f937;
sub.f32 f987, f921, f937;
add.f32 f2171, f953, f969;
sub.f32 f988, f953, f969;
add.f32 f989, f929, f945;
sub.f32 f991, f929, f945;
add.f32 f2170, f961, f977;
sub.f32 f992, f961, f977;
add.f32 f993, f985, f989;
sub.f32 f995, f985, f989;
add.f32 f2169, f2171, f2170;
sub.f32 f996, f2171, f2170;
sub.f32 f997, f987, f992;
add.f32 f999, f987, f992;
add.f32 f2168, f988, f991;
sub.f32 f1000, f988, f991;
add.f32 f1001, f925, f941;
sub.f32 f1003, f925, f941;
add.f32 f2167, f957, f973;
sub.f32 f1004, f957, f973;
add.f32 f1005, f933, f949;
sub.f32 f1007, f933, f949;
add.f32 f2166, f965, f981;
sub.f32 f1008, f965, f981;
add.f32 f1009, f1001, f1005;
sub.f32 f1011, f1001, f1005;
add.f32 f2165, f2167, f2166;
sub.f32 f1012, f2167, f2166;
sub.f32 f1013, f1003, f1008;
add.f32 f1015, f1003, f1008;
add.f32 f2164, f1004, f1007;
sub.f32 f1016, f1004, f1007;
mul.f32 f1017, f1013, 0f3F3504F3;
mul.f32 f1018, f2164, 0f3F3504F3;
sub.f32 f1019, f1017, f1018;
add.f32 f1020, f1017, f1018;
mul.f32 f2162, f1015, 0fBF3504F3;
mul.f32 f2163, f1016, 0f3F3504F3;
sub.f32 f1023, f2162, f2163;
mul.f32 f1024, f1016, 0fBF3504F3;
fma.rn.f32 f1025, f1015, 0f3F3504F3, f1024;
add.f32 f1026, f993, f1009;
sub.f32 f1028, f993, f1009;
add.f32 f2161, f2169, f2165;
sub.f32 f1029, f2169, f2165;
add.f32 f1030, f997, f1019;
sub.f32 f1032, f997, f1019;
add.f32 f2160, f2168, f1020;
sub.f32 f1033, f2168, f1020;
sub.f32 f1034, f995, f1012;
add.f32 f1036, f995, f1012;
add.f32 f2159, f996, f1011;
sub.f32 f1037, f996, f1011;
add.f32 f1038, f999, f1023;
sub.f32 f1040, f999, f1023;
add.f32 f2158, f1000, f1025;
sub.f32 f1041, f1000, f1025;
add.f32 f1042, f923, f939;
sub.f32 f1044, f923, f939;
add.f32 f2157, f955, f971;
sub.f32 f1045, f955, f971;
add.f32 f1046, f931, f947;
sub.f32 f1048, f931, f947;
add.f32 f2156, f963, f979;
sub.f32 f1049, f963, f979;
add.f32 f1050, f1042, f1046;
sub.f32 f1052, f1042, f1046;
add.f32 f2155, f2157, f2156;
sub.f32 f1053, f2157, f2156;
sub.f32 f1054, f1044, f1049;
add.f32 f1056, f1044, f1049;
add.f32 f2154, f1045, f1048;
sub.f32 f1057, f1045, f1048;
add.f32 f1058, f927, f943;
sub.f32 f1060, f927, f943;
add.f32 f2153, f959, f975;
sub.f32 f1061, f959, f975;
add.f32 f1062, f935, f951;
sub.f32 f1064, f935, f951;
add.f32 f2152, f967, f983;
sub.f32 f1065, f967, f983;
add.f32 f1066, f1058, f1062;
sub.f32 f1068, f1058, f1062;
add.f32 f2151, f2153, f2152;
sub.f32 f1069, f2153, f2152;
sub.f32 f1070, f1060, f1065;
add.f32 f1072, f1060, f1065;
add.f32 f2150, f1061, f1064;
sub.f32 f1073, f1061, f1064;
mul.f32 f1074, f1070, 0f3F3504F3;
mul.f32 f1075, f2150, 0f3F3504F3;
sub.f32 f1076, f1074, f1075;
add.f32 f1077, f1074, f1075;
mul.f32 f2148, f1072, 0fBF3504F3;
mul.f32 f2149, f1073, 0f3F3504F3;
sub.f32 f1080, f2148, f2149;
mul.f32 f1081, f1073, 0fBF3504F3;
fma.rn.f32 f1082, f1072, 0f3F3504F3, f1081;
add.f32 f1083, f1050, f1066;
sub.f32 f1085, f1050, f1066;
add.f32 f2147, f2155, f2151;
sub.f32 f1086, f2155, f2151;
add.f32 f1087, f1054, f1076;
sub.f32 f1089, f1054, f1076;
add.f32 f2146, f2154, f1077;
sub.f32 f1090, f2154, f1077;
sub.f32 f1091, f1052, f1069;
add.f32 f1093, f1052, f1069;
add.f32 f2145, f1053, f1068;
sub.f32 f1094, f1053, f1068;
add.f32 f1095, f1056, f1080;
sub.f32 f1097, f1056, f1080;
add.f32 f2144, f1057, f1082;
sub.f32 f1098, f1057, f1082;
mul.f32 f2142, f1087, 0f3F6C835E;
mul.f32 f2143, f2146, 0f3EC3EF15;
sub.f32 f1101, f2142, f2143;
mul.f32 f1102, f2146, 0f3F6C835E;
fma.rn.f32 f1103, f1087, 0f3EC3EF15, f1102;
mul.f32 f1104, f1091, 0f3F3504F3;
mul.f32 f1105, f2145, 0f3F3504F3;
sub.f32 f1106, f1104, f1105;
add.f32 f1107, f1104, f1105;
mul.f32 f2140, f1095, 0f3EC3EF15;
mul.f32 f2141, f2144, 0f3F6C835E;
sub.f32 f1110, f2140, f2141;
mul.f32 f1111, f2144, 0f3EC3EF15;
fma.rn.f32 f1112, f1095, 0f3F6C835E, f1111;
mul.f32 f2138, f1089, 0fBEC3EF15;
mul.f32 f2139, f1090, 0f3F6C835E;
sub.f32 f1115, f2138, f2139;
mul.f32 f1116, f1090, 0fBEC3EF15;
fma.rn.f32 f1117, f1089, 0f3F6C835E, f1116;
mul.f32 f2136, f1093, 0fBF3504F3;
mul.f32 f2137, f1094, 0f3F3504F3;
sub.f32 f1120, f2136, f2137;
mul.f32 f1121, f1094, 0fBF3504F3;
fma.rn.f32 f1122, f1093, 0f3F3504F3, f1121;
mul.f32 f2134, f1097, 0fBF6C835E;
mul.f32 f2135, f1098, 0f3EC3EF15;
sub.f32 f1125, f2134, f2135;
mul.f32 f1126, f1098, 0fBF6C835E;
fma.rn.f32 f1127, f1097, 0f3EC3EF15, f1126;
add.f32 f1128, f1026, f1083;
sub.f32 f1130, f1026, f1083;
add.f32 f2133, f2161, f2147;
sub.f32 f1131, f2161, f2147;
add.f32 f1132, f1030, f1101;
sub.f32 f1134, f1030, f1101;
add.f32 f2132, f2160, f1103;
sub.f32 f1135, f2160, f1103;
add.f32 f1136, f1034, f1106;
sub.f32 f1138, f1034, f1106;
add.f32 f2131, f2159, f1107;
sub.f32 f1139, f2159, f1107;
add.f32 f1140, f1038, f1110;
sub.f32 f1142, f1038, f1110;
add.f32 f2130, f2158, f1112;
sub.f32 f1143, f2158, f1112;
sub.f32 f1144, f1028, f1086;
add.f32 f1146, f1028, f1086;
add.f32 f2129, f1029, f1085;
sub.f32 f1147, f1029, f1085;
add.f32 f1148, f1032, f1115;
sub.f32 f1150, f1032, f1115;
add.f32 f2128, f1033, f1117;
sub.f32 f1151, f1033, f1117;
add.f32 f1152, f1036, f1120;
sub.f32 f1154, f1036, f1120;
add.f32 f2127, f1037, f1122;
sub.f32 f1155, f1037, f1122;
add.f32 f1156, f1040, f1125;
sub.f32 f1158, f1040, f1125;
add.f32 f2126, f1041, f1127;
sub.f32 f1159, f1041, f1127;
add.f32 f1160, f922, f938;
sub.f32 f1162, f922, f938;
add.f32 f2125, f954, f970;
sub.f32 f1163, f954, f970;
add.f32 f1164, f930, f946;
sub.f32 f1166, f930, f946;
add.f32 f2124, f962, f978;
sub.f32 f1167, f962, f978;
add.f32 f1168, f1160, f1164;
sub.f32 f1170, f1160, f1164;
add.f32 f2123, f2125, f2124;
sub.f32 f1171, f2125, f2124;
sub.f32 f1172, f1162, f1167;
add.f32 f1174, f1162, f1167;
add.f32 f2122, f1163, f1166;
sub.f32 f1175, f1163, f1166;
add.f32 f1176, f926, f942;
sub.f32 f1178, f926, f942;
add.f32 f2121, f958, f974;
sub.f32 f1179, f958, f974;
add.f32 f1180, f934, f950;
sub.f32 f1182, f934, f950;
add.f32 f2120, f966, f982;
sub.f32 f1183, f966, f982;
add.f32 f1184, f1176, f1180;
sub.f32 f1186, f1176, f1180;
add.f32 f2119, f2121, f2120;
sub.f32 f1187, f2121, f2120;
sub.f32 f1188, f1178, f1183;
add.f32 f1190, f1178, f1183;
add.f32 f2118, f1179, f1182;
sub.f32 f1191, f1179, f1182;
mul.f32 f1192, f1188, 0f3F3504F3;
mul.f32 f1193, f2118, 0f3F3504F3;
sub.f32 f1194, f1192, f1193;
add.f32 f1195, f1192, f1193;
mul.f32 f2116, f1190, 0fBF3504F3;
mul.f32 f2117, f1191, 0f3F3504F3;
sub.f32 f1198, f2116, f2117;
mul.f32 f1199, f1191, 0fBF3504F3;
fma.rn.f32 f1200, f1190, 0f3F3504F3, f1199;
add.f32 f1201, f1168, f1184;
sub.f32 f1203, f1168, f1184;
add.f32 f2115, f2123, f2119;
sub.f32 f1204, f2123, f2119;
add.f32 f1205, f1172, f1194;
sub.f32 f1207, f1172, f1194;
add.f32 f2114, f2122, f1195;
sub.f32 f1208, f2122, f1195;
sub.f32 f1209, f1170, f1187;
add.f32 f1211, f1170, f1187;
add.f32 f2113, f1171, f1186;
sub.f32 f1212, f1171, f1186;
add.f32 f1213, f1174, f1198;
sub.f32 f1215, f1174, f1198;
add.f32 f2112, f1175, f1200;
sub.f32 f1216, f1175, f1200;
add.f32 f1217, f924, f940;
sub.f32 f1219, f924, f940;
add.f32 f2111, f956, f972;
sub.f32 f1220, f956, f972;
add.f32 f1221, f932, f948;
sub.f32 f1223, f932, f948;
add.f32 f2110, f964, f980;
sub.f32 f1224, f964, f980;
add.f32 f1225, f1217, f1221;
sub.f32 f1227, f1217, f1221;
add.f32 f2109, f2111, f2110;
sub.f32 f1228, f2111, f2110;
sub.f32 f1229, f1219, f1224;
add.f32 f1231, f1219, f1224;
add.f32 f2108, f1220, f1223;
sub.f32 f1232, f1220, f1223;
add.f32 f1233, f928, f944;
sub.f32 f1235, f928, f944;
add.f32 f2107, f960, f976;
sub.f32 f1236, f960, f976;
add.f32 f1237, f936, f952;
sub.f32 f1239, f936, f952;
add.f32 f2106, f968, f984;
sub.f32 f1240, f968, f984;
add.f32 f1241, f1233, f1237;
sub.f32 f1243, f1233, f1237;
add.f32 f2105, f2107, f2106;
sub.f32 f1244, f2107, f2106;
sub.f32 f1245, f1235, f1240;
add.f32 f1247, f1235, f1240;
add.f32 f2104, f1236, f1239;
sub.f32 f1248, f1236, f1239;
mul.f32 f1249, f1245, 0f3F3504F3;
mul.f32 f1250, f2104, 0f3F3504F3;
sub.f32 f1251, f1249, f1250;
add.f32 f1252, f1249, f1250;
mul.f32 f2102, f1247, 0fBF3504F3;
mul.f32 f2103, f1248, 0f3F3504F3;
sub.f32 f1255, f2102, f2103;
mul.f32 f1256, f1248, 0fBF3504F3;
fma.rn.f32 f1257, f1247, 0f3F3504F3, f1256;
add.f32 f1258, f1225, f1241;
sub.f32 f1260, f1225, f1241;
add.f32 f2101, f2109, f2105;
sub.f32 f1261, f2109, f2105;
add.f32 f1262, f1229, f1251;
sub.f32 f1264, f1229, f1251;
add.f32 f2100, f2108, f1252;
sub.f32 f1265, f2108, f1252;
sub.f32 f1266, f1227, f1244;
add.f32 f1268, f1227, f1244;
add.f32 f2099, f1228, f1243;
sub.f32 f1269, f1228, f1243;
add.f32 f1270, f1231, f1255;
sub.f32 f1272, f1231, f1255;
add.f32 f2098, f1232, f1257;
sub.f32 f1273, f1232, f1257;
mul.f32 f2096, f1262, 0f3F6C835E;
mul.f32 f2097, f2100, 0f3EC3EF15;
sub.f32 f1276, f2096, f2097;
mul.f32 f1277, f2100, 0f3F6C835E;
fma.rn.f32 f1278, f1262, 0f3EC3EF15, f1277;
mul.f32 f1279, f1266, 0f3F3504F3;
mul.f32 f1280, f2099, 0f3F3504F3;
sub.f32 f1281, f1279, f1280;
add.f32 f1282, f1279, f1280;
mul.f32 f1284, f2098, 0f3F6C835E;
mul.f32 f2095, f1270, 0f3EC3EF15;
sub.f32 f1285, f2095, f1284;
mul.f32 f1286, f2098, 0f3EC3EF15;
fma.rn.f32 f1287, f1270, 0f3F6C835E, f1286;
mul.f32 f1289, f1265, 0f3F6C835E;
mul.f32 f2094, f1264, 0fBEC3EF15;
sub.f32 f1290, f2094, f1289;
mul.f32 f1291, f1265, 0fBEC3EF15;
fma.rn.f32 f1292, f1264, 0f3F6C835E, f1291;
mul.f32 f2092, f1268, 0fBF3504F3;
mul.f32 f2093, f1269, 0f3F3504F3;
sub.f32 f1295, f2092, f2093;
mul.f32 f1296, f1269, 0fBF3504F3;
fma.rn.f32 f1297, f1268, 0f3F3504F3, f1296;
mul.f32 f2090, f1272, 0fBF6C835E;
mul.f32 f2091, f1273, 0f3EC3EF15;
sub.f32 f1300, f2090, f2091;
mul.f32 f1301, f1273, 0fBF6C835E;
fma.rn.f32 f1302, f1272, 0f3EC3EF15, f1301;
add.f32 f1303, f1201, f1258;
sub.f32 f1305, f1201, f1258;
add.f32 f2089, f2115, f2101;
sub.f32 f1306, f2115, f2101;
add.f32 f1307, f1205, f1276;
sub.f32 f1309, f1205, f1276;
add.f32 f2088, f2114, f1278;
sub.f32 f1310, f2114, f1278;
add.f32 f1311, f1209, f1281;
sub.f32 f1313, f1209, f1281;
add.f32 f2087, f2113, f1282;
sub.f32 f1314, f2113, f1282;
add.f32 f1315, f1213, f1285;
sub.f32 f1317, f1213, f1285;
add.f32 f2086, f2112, f1287;
sub.f32 f1318, f2112, f1287;
sub.f32 f1319, f1203, f1261;
add.f32 f1321, f1203, f1261;
add.f32 f2085, f1204, f1260;
sub.f32 f1322, f1204, f1260;
add.f32 f1323, f1207, f1290;
sub.f32 f1325, f1207, f1290;
add.f32 f2084, f1208, f1292;
sub.f32 f1326, f1208, f1292;
add.f32 f1327, f1211, f1295;
sub.f32 f1329, f1211, f1295;
add.f32 f2083, f1212, f1297;
sub.f32 f1330, f1212, f1297;
add.f32 f1331, f1215, f1300;
sub.f32 f1333, f1215, f1300;
add.f32 f2082, f1216, f1302;
sub.f32 f1334, f1216, f1302;
mul.f32 f1336, f2088, 0f3E47C5C2;
mul.f32 f2081, f1307, 0f3F7B14BE;
sub.f32 f1337, f2081, f1336;
mul.f32 f1338, f2088, 0f3F7B14BE;
fma.rn.f32 f1339, f1307, 0f3E47C5C2, f1338;
mul.f32 f1341, f2087, 0f3EC3EF15;
mul.f32 f2080, f1311, 0f3F6C835E;
sub.f32 f1342, f2080, f1341;
mul.f32 f1343, f2087, 0f3F6C835E;
fma.rn.f32 f1344, f1311, 0f3EC3EF15, f1343;
mul.f32 f2078, f1315, 0f3F54DB31;
mul.f32 f2079, f2086, 0f3F0E39DA;
sub.f32 f1347, f2078, f2079;
mul.f32 f1348, f2086, 0f3F54DB31;
fma.rn.f32 f1349, f1315, 0f3F0E39DA, f1348;
mul.f32 f1350, f1319, 0f3F3504F3;
mul.f32 f1351, f2085, 0f3F3504F3;
sub.f32 f1352, f1350, f1351;
add.f32 f1353, f1350, f1351;
mul.f32 f1355, f2084, 0f3F54DB31;
mul.f32 f2077, f1323, 0f3F0E39DA;
sub.f32 f1356, f2077, f1355;
mul.f32 f1357, f2084, 0f3F0E39DA;
fma.rn.f32 f1358, f1323, 0f3F54DB31, f1357;
mul.f32 f1360, f2083, 0f3F6C835E;
mul.f32 f2076, f1327, 0f3EC3EF15;
sub.f32 f1361, f2076, f1360;
mul.f32 f1362, f2083, 0f3EC3EF15;
fma.rn.f32 f1363, f1327, 0f3F6C835E, f1362;
mul.f32 f2074, f1331, 0f3E47C5C2;
mul.f32 f2075, f2082, 0f3F7B14BE;
sub.f32 f1366, f2074, f2075;
mul.f32 f1367, f2082, 0f3E47C5C2;
fma.rn.f32 f1368, f1331, 0f3F7B14BE, f1367;
mul.f32 f2072, f1309, 0fBE47C5C2;
mul.f32 f2073, f1310, 0f3F7B14BE;
sub.f32 f1371, f2072, f2073;
mul.f32 f1372, f1310, 0fBE47C5C2;
fma.rn.f32 f1373, f1309, 0f3F7B14BE, f1372;
mul.f32 f2070, f1313, 0fBEC3EF15;
mul.f32 f2071, f1314, 0f3F6C835E;
sub.f32 f1376, f2070, f2071;
mul.f32 f1377, f1314, 0fBEC3EF15;
fma.rn.f32 f1378, f1313, 0f3F6C835E, f1377;
mul.f32 f2068, f1317, 0fBF0E39DA;
mul.f32 f2069, f1318, 0f3F54DB31;
sub.f32 f1381, f2068, f2069;
mul.f32 f1382, f1318, 0fBF0E39DA;
fma.rn.f32 f1383, f1317, 0f3F54DB31, f1382;
mul.f32 f1385, f1322, 0f3F3504F3;
mul.f32 f2067, f1321, 0fBF3504F3;
sub.f32 f1386, f2067, f1385;
mul.f32 f1387, f1322, 0fBF3504F3;
fma.rn.f32 f1388, f1321, 0f3F3504F3, f1387;
mul.f32 f1390, f1326, 0f3F0E39DA;
mul.f32 f2066, f1325, 0fBF54DB31;
sub.f32 f1391, f2066, f1390;
mul.f32 f1392, f1326, 0fBF54DB31;
fma.rn.f32 f1393, f1325, 0f3F0E39DA, f1392;
mul.f32 f1395, f1330, 0f3EC3EF15;
mul.f32 f2065, f1329, 0fBF6C835E;
sub.f32 f1396, f2065, f1395;
mul.f32 f1397, f1330, 0fBF6C835E;
fma.rn.f32 f1398, f1329, 0f3EC3EF15, f1397;
mul.f32 f1400, f1334, 0f3E47C5C2;
mul.f32 f2064, f1333, 0fBF7B14BE;
sub.f32 f1401, f2064, f1400;
mul.f32 f1402, f1334, 0fBF7B14BE;
fma.rn.f32 f1403, f1333, 0f3E47C5C2, f1402;
add.f32 f1404, f1128, f1303;
sub.f32 f1406, f1128, f1303;
add.f32 f2063, f2133, f2089;
sub.f32 f1407, f2133, f2089;
add.f32 f1408, f1132, f1337;
sub.f32 f1410, f1132, f1337;
add.f32 f2062, f2132, f1339;
sub.f32 f1411, f2132, f1339;
add.f32 f1412, f1136, f1342;
sub.f32 f1414, f1136, f1342;
add.f32 f2061, f2131, f1344;
sub.f32 f1415, f2131, f1344;
add.f32 f1416, f1140, f1347;
sub.f32 f1418, f1140, f1347;
add.f32 f2060, f2130, f1349;
sub.f32 f1419, f2130, f1349;
add.f32 f1420, f1144, f1352;
sub.f32 f1422, f1144, f1352;
add.f32 f2059, f2129, f1353;
sub.f32 f1423, f2129, f1353;
add.f32 f1424, f1148, f1356;
sub.f32 f1426, f1148, f1356;
add.f32 f2058, f2128, f1358;
sub.f32 f1427, f2128, f1358;
add.f32 f1428, f1152, f1361;
sub.f32 f1430, f1152, f1361;
add.f32 f2057, f2127, f1363;
sub.f32 f1431, f2127, f1363;
add.f32 f1432, f1156, f1366;
sub.f32 f1434, f1156, f1366;
add.f32 f2056, f2126, f1368;
sub.f32 f1435, f2126, f1368;
sub.f32 f1436, f1130, f1306;
add.f32 f1438, f1130, f1306;
add.f32 f2055, f1131, f1305;
sub.f32 f1439, f1131, f1305;
add.f32 f1440, f1134, f1371;
sub.f32 f1442, f1134, f1371;
add.f32 f2054, f1135, f1373;
sub.f32 f1443, f1135, f1373;
add.f32 f1444, f1138, f1376;
sub.f32 f1446, f1138, f1376;
add.f32 f2053, f1139, f1378;
sub.f32 f1447, f1139, f1378;
add.f32 f1448, f1142, f1381;
sub.f32 f1450, f1142, f1381;
add.f32 f2052, f1143, f1383;
sub.f32 f1451, f1143, f1383;
add.f32 f1452, f1146, f1386;
sub.f32 f1454, f1146, f1386;
add.f32 f2051, f1147, f1388;
sub.f32 f1455, f1147, f1388;
add.f32 f1456, f1150, f1391;
sub.f32 f1458, f1150, f1391;
add.f32 f2050, f1151, f1393;
sub.f32 f1459, f1151, f1393;
add.f32 f1460, f1154, f1396;
sub.f32 f1462, f1154, f1396;
add.f32 f2049, f1155, f1398;
sub.f32 f1463, f1155, f1398;
add.f32 f1464, f1158, f1401;
sub.f32 f1466, f1158, f1401;
add.f32 f2048, f1159, f1403;
sub.f32 f1467, f1159, f1403;
bfe.u32 r15, r32, 5, 2;
mul.wide.u32 rd6, r15, 8;
mov.u64 rd7, %66;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f1468, f1469}, [rd8];
mul.f32 f1472, f2062, f1469;
fma.rn.f32 f1473, f1468, f1408, f1472;
mul.f32 f1474, f1408, f1469;
mul.f32 f1475, f1468, f2062;
sub.f32 f1476, f1475, f1474;
mul.f32 f1478, f1469, f1469;
mul.f32 f2047, f1468, f1468;
sub.f32 f1479, f2047, f1478;
mul.f32 f1480, f1469, f1468;
fma.rn.f32 f1481, f1469, f1468, f1480;
mul.f32 f1482, f2061, f1481;
fma.rn.f32 f1483, f1479, f1412, f1482;
mul.f32 f1484, f1412, f1481;
mul.f32 f1485, f1479, f2061;
sub.f32 f1486, f1485, f1484;
mul.f32 f2045, f1468, f1479;
mul.f32 f2046, f1469, f1481;
sub.f32 f1489, f2045, f2046;
mul.f32 f1490, f1468, f1481;
fma.rn.f32 f1491, f1469, f1479, f1490;
mul.f32 f1492, f2060, f1491;
fma.rn.f32 f1493, f1489, f1416, f1492;
mul.f32 f1494, f1416, f1491;
mul.f32 f1495, f1489, f2060;
sub.f32 f1496, f1495, f1494;
mul.f32 f2043, f1468, f1489;
mul.f32 f2044, f1469, f1491;
sub.f32 f1499, f2043, f2044;
mul.f32 f1500, f1468, f1491;
fma.rn.f32 f1501, f1469, f1489, f1500;
mul.f32 f1502, f2059, f1501;
fma.rn.f32 f1503, f1499, f1420, f1502;
mul.f32 f1504, f1420, f1501;
mul.f32 f1505, f1499, f2059;
sub.f32 f1506, f1505, f1504;
mul.f32 f1508, f1469, f1501;
mul.f32 f2042, f1468, f1499;
sub.f32 f1509, f2042, f1508;
mul.f32 f1510, f1468, f1501;
fma.rn.f32 f1511, f1469, f1499, f1510;
mul.f32 f1512, f2058, f1511;
fma.rn.f32 f1513, f1509, f1424, f1512;
mul.f32 f1514, f1424, f1511;
mul.f32 f1515, f1509, f2058;
sub.f32 f1516, f1515, f1514;
mul.f32 f1518, f1469, f1511;
mul.f32 f2041, f1468, f1509;
sub.f32 f1519, f2041, f1518;
mul.f32 f1520, f1468, f1511;
fma.rn.f32 f1521, f1469, f1509, f1520;
mul.f32 f1522, f2057, f1521;
fma.rn.f32 f1523, f1519, f1428, f1522;
mul.f32 f1524, f1428, f1521;
mul.f32 f1525, f1519, f2057;
sub.f32 f1526, f1525, f1524;
mul.f32 f1528, f1469, f1521;
mul.f32 f2040, f1468, f1519;
sub.f32 f1529, f2040, f1528;
mul.f32 f1530, f1468, f1521;
fma.rn.f32 f1531, f1469, f1519, f1530;
mul.f32 f1532, f2056, f1531;
fma.rn.f32 f1533, f1529, f1432, f1532;
mul.f32 f1534, f1432, f1531;
mul.f32 f1535, f1529, f2056;
sub.f32 f1536, f1535, f1534;
mul.f32 f2038, f1468, f1529;
mul.f32 f2039, f1469, f1531;
sub.f32 f1539, f2038, f2039;
mul.f32 f1540, f1468, f1531;
fma.rn.f32 f1541, f1469, f1529, f1540;
mul.f32 f1542, f2055, f1541;
fma.rn.f32 f1543, f1539, f1436, f1542;
mul.f32 f1544, f1436, f1541;
mul.f32 f1545, f1539, f2055;
sub.f32 f1546, f1545, f1544;
mul.f32 f2036, f1468, f1539;
mul.f32 f2037, f1469, f1541;
sub.f32 f1549, f2036, f2037;
mul.f32 f1550, f1468, f1541;
fma.rn.f32 f1551, f1469, f1539, f1550;
mul.f32 f1552, f2054, f1551;
fma.rn.f32 f1553, f1549, f1440, f1552;
mul.f32 f1554, f1440, f1551;
mul.f32 f1555, f1549, f2054;
sub.f32 f1556, f1555, f1554;
mul.f32 f1558, f1469, f1551;
mul.f32 f2035, f1468, f1549;
sub.f32 f1559, f2035, f1558;
mul.f32 f1560, f1468, f1551;
fma.rn.f32 f1561, f1469, f1549, f1560;
mul.f32 f1562, f2053, f1561;
fma.rn.f32 f1563, f1559, f1444, f1562;
mul.f32 f1564, f1444, f1561;
mul.f32 f1565, f1559, f2053;
sub.f32 f1566, f1565, f1564;
mul.f32 f1568, f1469, f1561;
mul.f32 f2034, f1468, f1559;
sub.f32 f1569, f2034, f1568;
mul.f32 f1570, f1468, f1561;
fma.rn.f32 f1571, f1469, f1559, f1570;
mul.f32 f1572, f2052, f1571;
fma.rn.f32 f1573, f1569, f1448, f1572;
mul.f32 f1574, f1448, f1571;
mul.f32 f1575, f1569, f2052;
sub.f32 f1576, f1575, f1574;
mul.f32 f1578, f1469, f1571;
mul.f32 f2033, f1468, f1569;
sub.f32 f1579, f2033, f1578;
mul.f32 f1580, f1468, f1571;
fma.rn.f32 f1581, f1469, f1569, f1580;
mul.f32 f1582, f2051, f1581;
fma.rn.f32 f1583, f1579, f1452, f1582;
mul.f32 f1584, f1452, f1581;
mul.f32 f1585, f1579, f2051;
sub.f32 f1586, f1585, f1584;
mul.f32 f2031, f1468, f1579;
mul.f32 f2032, f1469, f1581;
sub.f32 f1589, f2031, f2032;
mul.f32 f1590, f1468, f1581;
fma.rn.f32 f1591, f1469, f1579, f1590;
mul.f32 f1592, f2050, f1591;
fma.rn.f32 f1593, f1589, f1456, f1592;
mul.f32 f1594, f1456, f1591;
mul.f32 f1595, f1589, f2050;
sub.f32 f1596, f1595, f1594;
mul.f32 f1598, f1469, f1591;
mul.f32 f2030, f1468, f1589;
sub.f32 f1599, f2030, f1598;
mul.f32 f1600, f1468, f1591;
fma.rn.f32 f1601, f1469, f1589, f1600;
mul.f32 f1602, f2049, f1601;
fma.rn.f32 f1603, f1599, f1460, f1602;
mul.f32 f1604, f1460, f1601;
mul.f32 f1605, f1599, f2049;
sub.f32 f1606, f1605, f1604;
mul.f32 f1608, f1469, f1601;
mul.f32 f2029, f1468, f1599;
sub.f32 f1609, f2029, f1608;
mul.f32 f1610, f1468, f1601;
fma.rn.f32 f1611, f1469, f1599, f1610;
mul.f32 f1612, f2048, f1611;
fma.rn.f32 f1613, f1609, f1464, f1612;
mul.f32 f1614, f1464, f1611;
mul.f32 f1615, f1609, f2048;
sub.f32 f1616, f1615, f1614;
mul.f32 f1618, f1469, f1611;
mul.f32 f2028, f1468, f1609;
sub.f32 f1619, f2028, f1618;
mul.f32 f1620, f1468, f1611;
fma.rn.f32 f1621, f1469, f1609, f1620;
mul.f32 f1622, f1407, f1621;
fma.rn.f32 f1623, f1619, f1406, f1622;
mul.f32 f1624, f1406, f1621;
mul.f32 f1625, f1619, f1407;
sub.f32 f1626, f1625, f1624;
mul.f32 f2026, f1468, f1619;
mul.f32 f2027, f1469, f1621;
sub.f32 f1629, f2026, f2027;
mul.f32 f1630, f1468, f1621;
fma.rn.f32 f1631, f1469, f1619, f1630;
mul.f32 f1632, f1411, f1631;
fma.rn.f32 f1633, f1629, f1410, f1632;
mul.f32 f1634, f1410, f1631;
mul.f32 f1635, f1629, f1411;
sub.f32 f1636, f1635, f1634;
mul.f32 f2024, f1468, f1629;
mul.f32 f2025, f1469, f1631;
sub.f32 f1639, f2024, f2025;
mul.f32 f1640, f1468, f1631;
fma.rn.f32 f1641, f1469, f1629, f1640;
mul.f32 f1642, f1415, f1641;
fma.rn.f32 f1643, f1639, f1414, f1642;
mul.f32 f1644, f1414, f1641;
mul.f32 f1645, f1639, f1415;
sub.f32 f1646, f1645, f1644;
mul.f32 f1648, f1469, f1641;
mul.f32 f2023, f1468, f1639;
sub.f32 f1649, f2023, f1648;
mul.f32 f1650, f1468, f1641;
fma.rn.f32 f1651, f1469, f1639, f1650;
mul.f32 f1652, f1419, f1651;
fma.rn.f32 f1653, f1649, f1418, f1652;
mul.f32 f1654, f1418, f1651;
mul.f32 f1655, f1649, f1419;
sub.f32 f1656, f1655, f1654;
mul.f32 f1658, f1469, f1651;
mul.f32 f2022, f1468, f1649;
sub.f32 f1659, f2022, f1658;
mul.f32 f1660, f1468, f1651;
fma.rn.f32 f1661, f1469, f1649, f1660;
mul.f32 f1662, f1423, f1661;
fma.rn.f32 f1663, f1659, f1422, f1662;
mul.f32 f1664, f1422, f1661;
mul.f32 f1665, f1659, f1423;
sub.f32 f1666, f1665, f1664;
mul.f32 f1668, f1469, f1661;
mul.f32 f2021, f1468, f1659;
sub.f32 f1669, f2021, f1668;
mul.f32 f1670, f1468, f1661;
fma.rn.f32 f1671, f1469, f1659, f1670;
mul.f32 f1672, f1427, f1671;
fma.rn.f32 f1673, f1669, f1426, f1672;
mul.f32 f1674, f1426, f1671;
mul.f32 f1675, f1669, f1427;
sub.f32 f1676, f1675, f1674;
mul.f32 f2019, f1468, f1669;
mul.f32 f2020, f1469, f1671;
sub.f32 f1679, f2019, f2020;
mul.f32 f1680, f1468, f1671;
fma.rn.f32 f1681, f1469, f1669, f1680;
mul.f32 f1682, f1431, f1681;
fma.rn.f32 f1683, f1679, f1430, f1682;
mul.f32 f1684, f1430, f1681;
mul.f32 f1685, f1679, f1431;
sub.f32 f1686, f1685, f1684;
mul.f32 f2017, f1468, f1679;
mul.f32 f2018, f1469, f1681;
sub.f32 f1689, f2017, f2018;
mul.f32 f1690, f1468, f1681;
fma.rn.f32 f1691, f1469, f1679, f1690;
mul.f32 f1692, f1435, f1691;
fma.rn.f32 f1693, f1689, f1434, f1692;
mul.f32 f1694, f1434, f1691;
mul.f32 f1695, f1689, f1435;
sub.f32 f1696, f1695, f1694;
mul.f32 f1698, f1469, f1691;
mul.f32 f2016, f1468, f1689;
sub.f32 f1699, f2016, f1698;
mul.f32 f1700, f1468, f1691;
fma.rn.f32 f1701, f1469, f1689, f1700;
mul.f32 f1702, f1439, f1701;
fma.rn.f32 f1703, f1699, f1438, f1702;
mul.f32 f1704, f1438, f1701;
mul.f32 f1705, f1699, f1439;
sub.f32 f1706, f1705, f1704;
mul.f32 f1708, f1469, f1701;
mul.f32 f2015, f1468, f1699;
sub.f32 f1709, f2015, f1708;
mul.f32 f1710, f1468, f1701;
fma.rn.f32 f1711, f1469, f1699, f1710;
mul.f32 f1712, f1443, f1711;
fma.rn.f32 f1713, f1709, f1442, f1712;
mul.f32 f1714, f1442, f1711;
mul.f32 f1715, f1709, f1443;
sub.f32 f1716, f1715, f1714;
mul.f32 f2013, f1468, f1709;
mul.f32 f2014, f1469, f1711;
sub.f32 f1719, f2013, f2014;
mul.f32 f1720, f1468, f1711;
fma.rn.f32 f1721, f1469, f1709, f1720;
mul.f32 f1722, f1447, f1721;
fma.rn.f32 f1723, f1719, f1446, f1722;
mul.f32 f1724, f1446, f1721;
mul.f32 f1725, f1719, f1447;
sub.f32 f1726, f1725, f1724;
mul.f32 f2011, f1468, f1719;
mul.f32 f2012, f1469, f1721;
sub.f32 f1729, f2011, f2012;
mul.f32 f1730, f1468, f1721;
fma.rn.f32 f1731, f1469, f1719, f1730;
mul.f32 f1732, f1451, f1731;
fma.rn.f32 f1733, f1729, f1450, f1732;
mul.f32 f1734, f1450, f1731;
mul.f32 f1735, f1729, f1451;
sub.f32 f1736, f1735, f1734;
mul.f32 f1738, f1469, f1731;
mul.f32 f2010, f1468, f1729;
sub.f32 f1739, f2010, f1738;
mul.f32 f1740, f1468, f1731;
fma.rn.f32 f1741, f1469, f1729, f1740;
mul.f32 f1742, f1455, f1741;
fma.rn.f32 f1743, f1739, f1454, f1742;
mul.f32 f1744, f1454, f1741;
mul.f32 f1745, f1739, f1455;
sub.f32 f1746, f1745, f1744;
mul.f32 f1748, f1469, f1741;
mul.f32 f2009, f1468, f1739;
sub.f32 f1749, f2009, f1748;
mul.f32 f1750, f1468, f1741;
fma.rn.f32 f1751, f1469, f1739, f1750;
mul.f32 f1752, f1459, f1751;
fma.rn.f32 f1753, f1749, f1458, f1752;
mul.f32 f1754, f1458, f1751;
mul.f32 f1755, f1749, f1459;
sub.f32 f1756, f1755, f1754;
mul.f32 f1758, f1469, f1751;
mul.f32 f2008, f1468, f1749;
sub.f32 f1759, f2008, f1758;
mul.f32 f1760, f1468, f1751;
fma.rn.f32 f1761, f1469, f1749, f1760;
mul.f32 f1762, f1463, f1761;
fma.rn.f32 f1763, f1759, f1462, f1762;
mul.f32 f1764, f1462, f1761;
mul.f32 f1765, f1759, f1463;
sub.f32 f1766, f1765, f1764;
mul.f32 f2006, f1468, f1759;
mul.f32 f2007, f1469, f1761;
sub.f32 f1769, f2006, f2007;
mov.u32 r28, %tid.x;
shl.b32 r27, r28, 7;
mul.f32 f1770, f1468, f1761;
mov.u32 r31, %tid.x;
shl.b32 r30, r31, 7;
fma.rn.f32 f1771, f1469, f1759, f1770;
mul.f32 f1772, f1467, f1771;
fma.rn.f32 f1773, f1769, f1466, f1772;
mul.f32 f1774, f1466, f1771;
mov.u32 r33, %tid.x;
mul.f32 f1775, f1769, f1467;
sub.f32 f1776, f1775, f1774;
and.b32 r22, r33, 96;
shl.b32 r16, r33, 2;
and.b32 r17, r16, 124;
add.s32 r18, r10, r17;
barrier.sync 0;
and.b32 r19, r30, 12288;
add.s32 r20, r18, r19;
st.shared.f32 [r20], f1404;
st.shared.f32 [r20+128], f1473;
st.shared.f32 [r20+256], f1483;
st.shared.f32 [r20+384], f1493;
st.shared.f32 [r20+512], f1503;
st.shared.f32 [r20+640], f1513;
st.shared.f32 [r20+768], f1523;
st.shared.f32 [r20+896], f1533;
st.shared.f32 [r20+1024], f1543;
st.shared.f32 [r20+1152], f1553;
st.shared.f32 [r20+1280], f1563;
st.shared.f32 [r20+1408], f1573;
st.shared.f32 [r20+1536], f1583;
st.shared.f32 [r20+1664], f1593;
st.shared.f32 [r20+1792], f1603;
st.shared.f32 [r20+1920], f1613;
st.shared.f32 [r20+2048], f1623;
st.shared.f32 [r20+2176], f1633;
st.shared.f32 [r20+2304], f1643;
st.shared.f32 [r20+2432], f1653;
st.shared.f32 [r20+2560], f1663;
st.shared.f32 [r20+2688], f1673;
st.shared.f32 [r20+2816], f1683;
st.shared.f32 [r20+2944], f1693;
st.shared.f32 [r20+3072], f1703;
st.shared.f32 [r20+3200], f1713;
st.shared.f32 [r20+3328], f1723;
st.shared.f32 [r20+3456], f1733;
st.shared.f32 [r20+3584], f1743;
st.shared.f32 [r20+3712], f1753;
st.shared.f32 [r20+3840], f1763;
st.shared.f32 [r20+3968], f1773;
barrier.sync 0;
mad.lo.s32 r21, r22, -124, r20;
ld.shared.f32 f1777, [r21];
ld.shared.f32 f1778, [r21+512];
ld.shared.f32 f1779, [r21+1024];
ld.shared.f32 f1780, [r21+1536];
ld.shared.f32 f1781, [r21+2048];
ld.shared.f32 f1782, [r21+2560];
ld.shared.f32 f1783, [r21+3072];
ld.shared.f32 f1784, [r21+3584];
ld.shared.f32 f1785, [r21+4096];
ld.shared.f32 f1786, [r21+4608];
ld.shared.f32 f1787, [r21+5120];
ld.shared.f32 f1788, [r21+5632];
ld.shared.f32 f1789, [r21+6144];
ld.shared.f32 f1790, [r21+6656];
ld.shared.f32 f1791, [r21+7168];
ld.shared.f32 f1792, [r21+7680];
ld.shared.f32 f1793, [r21+8192];
ld.shared.f32 f1794, [r21+8704];
ld.shared.f32 f1795, [r21+9216];
ld.shared.f32 f1796, [r21+9728];
ld.shared.f32 f1797, [r21+10240];
ld.shared.f32 f1798, [r21+10752];
ld.shared.f32 f1799, [r21+11264];
ld.shared.f32 f1800, [r21+11776];
ld.shared.f32 f1801, [r21+12288];
ld.shared.f32 f1802, [r21+12800];
ld.shared.f32 f1803, [r21+13312];
ld.shared.f32 f1804, [r21+13824];
ld.shared.f32 f1805, [r21+14336];
ld.shared.f32 f1806, [r21+14848];
ld.shared.f32 f1807, [r21+15360];
ld.shared.f32 f1808, [r21+15872];
barrier.sync 0;
st.shared.f32 [r20], f2063;
st.shared.f32 [r20+128], f1476;
st.shared.f32 [r20+256], f1486;
st.shared.f32 [r20+384], f1496;
st.shared.f32 [r20+512], f1506;
st.shared.f32 [r20+640], f1516;
st.shared.f32 [r20+768], f1526;
st.shared.f32 [r20+896], f1536;
st.shared.f32 [r20+1024], f1546;
st.shared.f32 [r20+1152], f1556;
st.shared.f32 [r20+1280], f1566;
st.shared.f32 [r20+1408], f1576;
st.shared.f32 [r20+1536], f1586;
st.shared.f32 [r20+1664], f1596;
st.shared.f32 [r20+1792], f1606;
st.shared.f32 [r20+1920], f1616;
st.shared.f32 [r20+2048], f1626;
st.shared.f32 [r20+2176], f1636;
st.shared.f32 [r20+2304], f1646;
st.shared.f32 [r20+2432], f1656;
st.shared.f32 [r20+2560], f1666;
st.shared.f32 [r20+2688], f1676;
st.shared.f32 [r20+2816], f1686;
st.shared.f32 [r20+2944], f1696;
st.shared.f32 [r20+3072], f1706;
st.shared.f32 [r20+3200], f1716;
st.shared.f32 [r20+3328], f1726;
st.shared.f32 [r20+3456], f1736;
st.shared.f32 [r20+3584], f1746;
st.shared.f32 [r20+3712], f1756;
st.shared.f32 [r20+3840], f1766;
st.shared.f32 [r20+3968], f1776;
barrier.sync 0;
ld.shared.f32 f1809, [r21];
ld.shared.f32 f1810, [r21+512];
ld.shared.f32 f1811, [r21+1024];
ld.shared.f32 f1812, [r21+1536];
ld.shared.f32 f1813, [r21+2048];
ld.shared.f32 f1814, [r21+2560];
ld.shared.f32 f1815, [r21+3072];
ld.shared.f32 f1816, [r21+3584];
ld.shared.f32 f1817, [r21+4096];
ld.shared.f32 f1818, [r21+4608];
ld.shared.f32 f1819, [r21+5120];
ld.shared.f32 f1820, [r21+5632];
ld.shared.f32 f1821, [r21+6144];
ld.shared.f32 f1822, [r21+6656];
ld.shared.f32 f1823, [r21+7168];
ld.shared.f32 f1824, [r21+7680];
ld.shared.f32 f1825, [r21+8192];
ld.shared.f32 f1826, [r21+8704];
ld.shared.f32 f1827, [r21+9216];
ld.shared.f32 f1828, [r21+9728];
ld.shared.f32 f1829, [r21+10240];
ld.shared.f32 f1830, [r21+10752];
ld.shared.f32 f1831, [r21+11264];
ld.shared.f32 f1832, [r21+11776];
ld.shared.f32 f1833, [r21+12288];
ld.shared.f32 f1834, [r21+12800];
ld.shared.f32 f1835, [r21+13312];
ld.shared.f32 f1836, [r21+13824];
ld.shared.f32 f1837, [r21+14336];
ld.shared.f32 f1838, [r21+14848];
ld.shared.f32 f1839, [r21+15360];
ld.shared.f32 f1840, [r21+15872];
add.f32 f1841, f1777, f1793;
sub.f32 f1843, f1777, f1793;
add.f32 f2005, f1809, f1825;
sub.f32 f1844, f1809, f1825;
add.f32 f1845, f1785, f1801;
sub.f32 f1847, f1785, f1801;
add.f32 f2004, f1817, f1833;
sub.f32 f1848, f1817, f1833;
add.f32 f1849, f1778, f1794;
sub.f32 f1851, f1778, f1794;
add.f32 f2003, f1810, f1826;
sub.f32 f1852, f1810, f1826;
add.f32 f1853, f1786, f1802;
sub.f32 f1855, f1786, f1802;
add.f32 f2002, f1818, f1834;
sub.f32 f1856, f1818, f1834;
add.f32 f1857, f1779, f1795;
sub.f32 f1859, f1779, f1795;
add.f32 f2001, f1811, f1827;
sub.f32 f1860, f1811, f1827;
add.f32 f1861, f1787, f1803;
sub.f32 f1863, f1787, f1803;
add.f32 f2000, f1819, f1835;
sub.f32 f1864, f1819, f1835;
add.f32 f1865, f1780, f1796;
sub.f32 f1867, f1780, f1796;
add.f32 f1999, f1812, f1828;
sub.f32 f1868, f1812, f1828;
add.f32 f1869, f1788, f1804;
sub.f32 f1871, f1788, f1804;
add.f32 f1998, f1820, f1836;
sub.f32 f1872, f1820, f1836;
add.f32 f1873, f1781, f1797;
sub.f32 f1875, f1781, f1797;
add.f32 f1997, f1813, f1829;
sub.f32 f1876, f1813, f1829;
add.f32 f1877, f1789, f1805;
sub.f32 f1879, f1789, f1805;
add.f32 f1996, f1821, f1837;
sub.f32 f1880, f1821, f1837;
add.f32 f1881, f1782, f1798;
sub.f32 f1883, f1782, f1798;
add.f32 f1995, f1814, f1830;
sub.f32 f1884, f1814, f1830;
add.f32 f1885, f1790, f1806;
sub.f32 f1887, f1790, f1806;
add.f32 f1994, f1822, f1838;
sub.f32 f1888, f1822, f1838;
add.f32 f1889, f1783, f1799;
sub.f32 f1891, f1783, f1799;
add.f32 f1993, f1815, f1831;
sub.f32 f1892, f1815, f1831;
add.f32 f1893, f1791, f1807;
sub.f32 f1895, f1791, f1807;
add.f32 f1992, f1823, f1839;
sub.f32 f1896, f1823, f1839;
add.f32 f1897, f1784, f1800;
sub.f32 f1899, f1784, f1800;
add.f32 f1991, f1816, f1832;
sub.f32 f1900, f1816, f1832;
add.f32 f1901, f1792, f1808;
sub.f32 f1903, f1792, f1808;
add.f32 f1990, f1824, f1840;
sub.f32 f1904, f1824, f1840;
add.f32 %1, f2005, f2004;
add.f32 %0, f1841, f1845;
add.f32 %3, f2003, f2002;
add.f32 %2, f1849, f1853;
add.f32 %5, f2001, f2000;
add.f32 %4, f1857, f1861;
add.f32 %7, f1999, f1998;
add.f32 %6, f1865, f1869;
add.f32 %8, f1873, f1877;
add.f32 %9, f1997, f1996;
add.f32 %10, f1881, f1885;
add.f32 %11, f1995, f1994;
add.f32 %13, f1993, f1992;
add.f32 %12, f1889, f1893;
add.f32 %15, f1991, f1990;
add.f32 %14, f1897, f1901;
sub.f32 %16, f1843, f1848;
add.f32 %17, f1844, f1847;
sub.f32 %18, f1851, f1856;
add.f32 %19, f1852, f1855;
add.f32 %21, f1860, f1863;
sub.f32 %20, f1859, f1864;
add.f32 %23, f1868, f1871;
sub.f32 %22, f1867, f1872;
add.f32 %25, f1876, f1879;
sub.f32 %24, f1875, f1880;
sub.f32 %26, f1883, f1888;
add.f32 %27, f1884, f1887;
sub.f32 %28, f1891, f1896;
add.f32 %29, f1892, f1895;
sub.f32 %30, f1899, f1904;
add.f32 %31, f1900, f1903;
sub.f32 %32, f1841, f1845;
sub.f32 %33, f2005, f2004;
sub.f32 %34, f1849, f1853;
sub.f32 %35, f2003, f2002;
sub.f32 %36, f1857, f1861;
sub.f32 %37, f2001, f2000;
sub.f32 %38, f1865, f1869;
sub.f32 %39, f1999, f1998;
sub.f32 %40, f1873, f1877;
sub.f32 %41, f1997, f1996;
sub.f32 %42, f1881, f1885;
sub.f32 %43, f1995, f1994;
sub.f32 %44, f1889, f1893;
sub.f32 %45, f1993, f1992;
sub.f32 %46, f1897, f1901;
sub.f32 %47, f1991, f1990;
sub.f32 %49, f1844, f1847;
add.f32 %48, f1843, f1848;
sub.f32 %51, f1852, f1855;
add.f32 %50, f1851, f1856;
sub.f32 %53, f1860, f1863;
add.f32 %52, f1859, f1864;
sub.f32 %55, f1868, f1871;
add.f32 %54, f1867, f1872;
sub.f32 %57, f1876, f1879;
add.f32 %56, f1875, f1880;
sub.f32 %59, f1884, f1887;
add.f32 %58, f1883, f1888;
sub.f32 %61, f1892, f1895;
add.f32 %60, f1891, f1896;
sub.f32 %63, f1900, f1903;
add.f32 %62, f1899, f1904;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_4096), "l"(lut_sp_32_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), 
"f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y));
};




// Specialization <309, float, 1> of cufftdx_private_function, implemented as one
// inline-PTX block. Going by this header's include guard
// (CUFFTDX_FFT_4096_FP32_INV_PTX_HPP) it belongs to the 4096-point
// single-precision inverse FFT kernels.
// NOTE(review): the meaning of the id 309 and of the trailing template argument
// is not visible in this file section -- confirm against the dispatcher.
//
// Parameters:
//   rmem  Per-thread array of complex<float>. rmem[0..3] are read as inputs and
//         overwritten with the results (4 complex values per thread).
//   smem  32-bit shared-memory address used as the base of every
//         st.shared / ld.shared access in the body.
//
// What the PTX body does:
//   * Six radix-4 butterfly passes. The first pass consumes the register
//     inputs; each of the other five consumes four complex values re-read from
//     shared memory at byte offsets 0, 8192, 16384 and 24576 from a
//     per-thread address.
//   * The per-thread shared-memory window starts at
//       smem + (tid.y << 15) + ((tid.x << 5) & -32768),
//     i.e. 32768 bytes per tid.y value and per group of 1024 tid.x values.
//   * Before each of the five exchanges, one (re, im) pair w is loaded with
//     ld.global from a lookup table; w^2 and w^3 are formed with multiplies.
//     Three of the four butterfly outputs are then scaled as
//       out.re = p.re * x.re + p.im * x.im
//       out.im = p.re * x.im - p.im * x.re
//     with p = w, w^2, w^3 respectively; the fourth (plain sum) is stored as is.
//     Table / entry index per exchange (8-byte entries):
//       1: lut_sp_4_4096, tid.x & 1023
//       2: lut_sp_4_1024, bits [2,10) of tid.x
//       3: lut_sp_4_256,  bits [4,10) of tid.x
//       4: lut_sp_4_64,   bits [6,10) of tid.x
//       5: lut_sp_4_16,   bits [8,10) of tid.x
//   * Every shared-memory store phase and every following load phase is
//     preceded by barrier.sync 0 (ten barriers in total), so all threads taking
//     part in barrier 0 have to execute this function together.
//     NOTE(review): presumably that is the whole thread block -- confirm with
//     the launch configuration used by the caller.
//
// The lookup tables (lut_sp_4_*) are declared outside this section.
template<> __forceinline__ __device__ void cufftdx_private_function<309, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<338>;
.reg .b32 r<42>;
.reg .b64 rd<18>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 15;
mov.u32 r3, %8;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f17, %14, %19;
add.f32 f18, %15, %21;
sub.f32 f19, %14, %19;
sub.f32 f20, %15, %21;
add.f32 f21, %16, %22;
add.f32 f22, %18, %23;
sub.f32 f23, %16, %22;
sub.f32 f24, %18, %23;
sub.f32 f25, f17, f21;
sub.f32 f26, f18, f22;
sub.f32 f27, f19, f24;
add.f32 f28, f20, f23;
add.f32 f29, f19, f24;
sub.f32 f30, f20, f23;
and.b32 r6, r5, 1023;
shl.b32 r7, r5, 5;
and.b32 r8, r7, -32768;
add.s32 r9, r4, r8;
shl.b32 r10, r5, 3;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 8184;
mov.u64 rd4, %9;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f31, f32}, [rd5];
mul.f32 f35, f28, f32;
mul.f32 f36, f27, f32;
mul.f32 f37, f31, f28;
mul.f32 f38, f31, f31;
mul.f32 f39, f32, f32;
sub.f32 f40, f38, f39;
mul.f32 f41, f32, f31;
fma.rn.f32 f42, f32, f31, f41;
mul.f32 f43, f26, f42;
mul.f32 f44, f25, f42;
mul.f32 f45, f40, f26;
mul.f32 f46, f31, f40;
mul.f32 f47, f32, f42;
sub.f32 f48, f46, f47;
mul.f32 f49, f31, f42;
fma.rn.f32 f50, f32, f40, f49;
mul.f32 f51, f30, f50;
mul.f32 f52, f29, f50;
mul.f32 f53, f48, f30;
barrier.sync 0;
and.b32 r11, r7, 32736;
add.s32 r12, r9, r11;
add.f32 f54, f18, f22;
add.f32 f55, f17, f21;
fma.rn.f32 f56, f31, f27, f35;
sub.f32 f57, f37, f36;
st.shared.v4.f32 [r12], {f55, f54, f56, f57};
sub.f32 f58, f45, f44;
fma.rn.f32 f59, f40, f25, f43;
fma.rn.f32 f60, f48, f29, f51;
sub.f32 f61, f53, f52;
st.shared.v4.f32 [r12+16], {f59, f58, f60, f61};
barrier.sync 0;
mad.lo.s32 r13, r6, -24, r12;
ld.shared.v2.f32 {f62, f63}, [r13];
ld.shared.v2.f32 {f66, f67}, [r13+8192];
ld.shared.v2.f32 {f70, f71}, [r13+16384];
ld.shared.v2.f32 {f74, f75}, [r13+24576];
add.f32 f78, f62, f70;
add.f32 f79, f63, f71;
sub.f32 f80, f62, f70;
sub.f32 f81, f63, f71;
add.f32 f82, f66, f74;
add.f32 f83, f67, f75;
sub.f32 f84, f66, f74;
sub.f32 f85, f67, f75;
sub.f32 f86, f78, f82;
sub.f32 f87, f79, f83;
sub.f32 f88, f80, f85;
add.f32 f89, f81, f84;
add.f32 f90, f80, f85;
sub.f32 f91, f81, f84;
and.b32 r14, r5, 1020;
bfe.u32 r15, r5, 2, 8;
mul.wide.u32 rd6, r15, 8;
mov.u64 rd7, %10;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f92, f93}, [rd8];
mul.f32 f96, f89, f93;
mul.f32 f97, f88, f93;
mul.f32 f98, f92, f89;
mul.f32 f99, f92, f92;
mul.f32 f100, f93, f93;
sub.f32 f101, f99, f100;
mul.f32 f102, f93, f92;
fma.rn.f32 f103, f93, f92, f102;
mul.f32 f104, f87, f103;
mul.f32 f105, f86, f103;
mul.f32 f106, f101, f87;
mul.f32 f107, f92, f101;
mul.f32 f108, f93, f103;
sub.f32 f109, f107, f108;
mul.f32 f110, f92, f103;
fma.rn.f32 f111, f93, f101, f110;
mul.f32 f112, f91, f111;
mul.f32 f113, f90, f111;
mul.f32 f114, f109, f91;
and.b32 r16, r10, 24;
add.s32 r17, r9, r16;
barrier.sync 0;
and.b32 r18, r7, 32640;
add.s32 r19, r17, r18;
add.f32 f115, f79, f83;
add.f32 f116, f78, f82;
st.shared.v2.f32 [r19], {f116, f115};
fma.rn.f32 f117, f92, f88, f96;
sub.f32 f118, f98, f97;
st.shared.v2.f32 [r19+32], {f117, f118};
fma.rn.f32 f119, f101, f86, f104;
sub.f32 f120, f106, f105;
st.shared.v2.f32 [r19+64], {f119, f120};
sub.f32 f121, f114, f113;
fma.rn.f32 f122, f109, f90, f112;
st.shared.v2.f32 [r19+96], {f122, f121};
barrier.sync 0;
mad.lo.s32 r20, r14, -24, r19;
ld.shared.v2.f32 {f123, f124}, [r20];
ld.shared.v2.f32 {f127, f128}, [r20+8192];
ld.shared.v2.f32 {f131, f132}, [r20+16384];
ld.shared.v2.f32 {f135, f136}, [r20+24576];
add.f32 f139, f123, f131;
add.f32 f140, f124, f132;
sub.f32 f141, f123, f131;
sub.f32 f142, f124, f132;
add.f32 f143, f127, f135;
add.f32 f144, f128, f136;
sub.f32 f145, f127, f135;
sub.f32 f146, f128, f136;
sub.f32 f147, f139, f143;
sub.f32 f148, f140, f144;
sub.f32 f149, f141, f146;
add.f32 f150, f142, f145;
add.f32 f151, f141, f146;
sub.f32 f152, f142, f145;
and.b32 r21, r5, 1008;
bfe.u32 r22, r5, 4, 6;
mul.wide.u32 rd9, r22, 8;
mov.u64 rd10, %11;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f153, f154}, [rd11];
mul.f32 f157, f150, f154;
mul.f32 f158, f149, f154;
mul.f32 f159, f153, f150;
mul.f32 f160, f153, f153;
mul.f32 f161, f154, f154;
sub.f32 f162, f160, f161;
mul.f32 f163, f154, f153;
fma.rn.f32 f164, f154, f153, f163;
mul.f32 f165, f148, f164;
mul.f32 f166, f147, f164;
mul.f32 f167, f162, f148;
mul.f32 f168, f153, f162;
mul.f32 f169, f154, f164;
sub.f32 f170, f168, f169;
mul.f32 f171, f153, f164;
fma.rn.f32 f172, f154, f162, f171;
mul.f32 f173, f152, f172;
mul.f32 f174, f151, f172;
mul.f32 f175, f170, f152;
and.b32 r23, r10, 120;
add.s32 r24, r9, r23;
barrier.sync 0;
and.b32 r25, r7, 32256;
add.s32 r26, r24, r25;
add.f32 f176, f140, f144;
add.f32 f177, f139, f143;
st.shared.v2.f32 [r26], {f177, f176};
fma.rn.f32 f178, f153, f149, f157;
sub.f32 f179, f159, f158;
st.shared.v2.f32 [r26+128], {f178, f179};
fma.rn.f32 f180, f162, f147, f165;
sub.f32 f181, f167, f166;
st.shared.v2.f32 [r26+256], {f180, f181};
sub.f32 f182, f175, f174;
fma.rn.f32 f183, f170, f151, f173;
st.shared.v2.f32 [r26+384], {f183, f182};
barrier.sync 0;
mad.lo.s32 r27, r21, -24, r26;
ld.shared.v2.f32 {f184, f185}, [r27];
ld.shared.v2.f32 {f188, f189}, [r27+8192];
ld.shared.v2.f32 {f192, f193}, [r27+16384];
ld.shared.v2.f32 {f196, f197}, [r27+24576];
add.f32 f200, f184, f192;
add.f32 f201, f185, f193;
sub.f32 f202, f184, f192;
sub.f32 f203, f185, f193;
add.f32 f204, f188, f196;
add.f32 f205, f189, f197;
sub.f32 f206, f188, f196;
sub.f32 f207, f189, f197;
sub.f32 f208, f200, f204;
sub.f32 f209, f201, f205;
sub.f32 f210, f202, f207;
add.f32 f211, f203, f206;
add.f32 f212, f202, f207;
sub.f32 f213, f203, f206;
and.b32 r28, r5, 960;
bfe.u32 r29, r5, 6, 4;
mul.wide.u32 rd12, r29, 8;
mov.u64 rd13, %12;
add.s64 rd14, rd13, rd12;
ld.global.v2.f32 {f214, f215}, [rd14];
mul.f32 f218, f211, f215;
mul.f32 f219, f210, f215;
mul.f32 f220, f214, f211;
mul.f32 f221, f214, f214;
mul.f32 f222, f215, f215;
sub.f32 f223, f221, f222;
mul.f32 f224, f215, f214;
fma.rn.f32 f225, f215, f214, f224;
mul.f32 f226, f209, f225;
mul.f32 f227, f208, f225;
mul.f32 f228, f223, f209;
mul.f32 f229, f214, f223;
mul.f32 f230, f215, f225;
sub.f32 f231, f229, f230;
mul.f32 f232, f214, f225;
fma.rn.f32 f233, f215, f223, f232;
mul.f32 f234, f213, f233;
mul.f32 f235, f212, f233;
mul.f32 f236, f231, f213;
and.b32 r30, r10, 504;
add.s32 r31, r9, r30;
barrier.sync 0;
and.b32 r32, r7, 30720;
add.s32 r33, r31, r32;
add.f32 f237, f201, f205;
add.f32 f238, f200, f204;
st.shared.v2.f32 [r33], {f238, f237};
fma.rn.f32 f239, f214, f210, f218;
sub.f32 f240, f220, f219;
st.shared.v2.f32 [r33+512], {f239, f240};
fma.rn.f32 f241, f223, f208, f226;
sub.f32 f242, f228, f227;
st.shared.v2.f32 [r33+1024], {f241, f242};
sub.f32 f243, f236, f235;
fma.rn.f32 f244, f231, f212, f234;
st.shared.v2.f32 [r33+1536], {f244, f243};
barrier.sync 0;
mad.lo.s32 r34, r28, -24, r33;
ld.shared.v2.f32 {f245, f246}, [r34];
ld.shared.v2.f32 {f249, f250}, [r34+8192];
ld.shared.v2.f32 {f253, f254}, [r34+16384];
ld.shared.v2.f32 {f257, f258}, [r34+24576];
add.f32 f261, f245, f253;
add.f32 f262, f246, f254;
sub.f32 f263, f245, f253;
sub.f32 f264, f246, f254;
add.f32 f265, f249, f257;
add.f32 f266, f250, f258;
sub.f32 f267, f249, f257;
sub.f32 f268, f250, f258;
sub.f32 f269, f261, f265;
sub.f32 f270, f262, f266;
sub.f32 f271, f263, f268;
add.f32 f272, f264, f267;
add.f32 f273, f263, f268;
sub.f32 f274, f264, f267;
and.b32 r35, r5, 768;
bfe.u32 r36, r5, 8, 2;
mul.wide.u32 rd15, r36, 8;
mov.u64 rd16, %13;
add.s64 rd17, rd16, rd15;
ld.global.v2.f32 {f275, f276}, [rd17];
mul.f32 f279, f272, f276;
mul.f32 f280, f271, f276;
mul.f32 f281, f275, f272;
mul.f32 f282, f275, f275;
mul.f32 f283, f276, f276;
sub.f32 f284, f282, f283;
mul.f32 f285, f276, f275;
fma.rn.f32 f286, f276, f275, f285;
mul.f32 f287, f270, f286;
mul.f32 f288, f269, f286;
mul.f32 f289, f284, f270;
mul.f32 f290, f275, f284;
mul.f32 f291, f276, f286;
sub.f32 f292, f290, f291;
mul.f32 f293, f275, f286;
fma.rn.f32 f294, f276, f284, f293;
mul.f32 f295, f274, f294;
mul.f32 f296, f273, f294;
mul.f32 f297, f292, f274;
and.b32 r37, r10, 2040;
add.s32 r38, r9, r37;
barrier.sync 0;
and.b32 r39, r7, 24576;
add.s32 r40, r38, r39;
add.f32 f298, f262, f266;
add.f32 f299, f261, f265;
st.shared.v2.f32 [r40], {f299, f298};
fma.rn.f32 f300, f275, f271, f279;
sub.f32 f301, f281, f280;
st.shared.v2.f32 [r40+2048], {f300, f301};
fma.rn.f32 f302, f284, f269, f287;
sub.f32 f303, f289, f288;
st.shared.v2.f32 [r40+4096], {f302, f303};
sub.f32 f304, f297, f296;
fma.rn.f32 f305, f292, f273, f295;
st.shared.v2.f32 [r40+6144], {f305, f304};
barrier.sync 0;
mad.lo.s32 r41, r35, -24, r40;
ld.shared.v2.f32 {f306, f307}, [r41];
ld.shared.v2.f32 {f310, f311}, [r41+8192];
ld.shared.v2.f32 {f314, f315}, [r41+16384];
ld.shared.v2.f32 {f318, f319}, [r41+24576];
add.f32 f322, f306, f314;
add.f32 f323, f307, f315;
sub.f32 f324, f306, f314;
sub.f32 f325, f307, f315;
add.f32 f326, f310, f318;
add.f32 f327, f311, f319;
sub.f32 f328, f310, f318;
sub.f32 f329, f311, f319;
add.f32 %1, f323, f327;
add.f32 %0, f322, f326;
add.f32 %3, f325, f328;
sub.f32 %2, f324, f329;
sub.f32 %5, f323, f327;
sub.f32 %4, f322, f326;
sub.f32 %7, f325, f328;
add.f32 %6, f324, f329;
})"
     // Operand map for the PTX above:
     //   %0-%7   outputs, rmem[0..3] as (x, y) pairs
     //   %8      smem base address
     //   %9-%13  lookup-table pointers, in the order the five exchanges use them
     //   %14-%23 inputs from rmem[0..3]; rmem[1].y is bound twice (%17, %18) and
     //           rmem[2].y is bound twice (%20, %21). The PTX references only
     //           %18 and %21, so %17 and %20 are unused.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_4096), "l"(lut_sp_4_1024), "l"(lut_sp_4_256), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y));
};




// cufftdx_private_function<310, float, 1>
//
// Per-thread body of a block-cooperative single-precision FFT step, emitted as
// one inline-PTX program. Each thread enters with four complex values in
// rmem[0..3], exchanges intermediate values with other threads through shared
// memory, and leaves with four complex values written back to rmem[0..3].
// (The include guard of this header names the file 4096 / FP32 / INV; the
// arithmetic below is six chained 4-input butterflies.)
//
// Parameters:
//   rmem - per-thread array; elements 0..3 are read (.x and .y) as inputs and
//          all eight components are overwritten with the result.
//   smem - 32-bit value used directly as the base of st.shared / ld.shared
//          addresses.
//
// Outline of the PTX, in instruction order:
//   1. 4-input butterfly on the register inputs (pairs 0/2 and 1/3 are
//      combined first, then sums/differences of those).
//   2. Five "twiddle + exchange + butterfly" rounds. In each round a pair
//      (c, s) is loaded with ld.global.v2.f32 from one of the lookup tables,
//      its square and cube are formed in registers, and butterfly outputs
//      1, 2, 3 are scaled as
//          re' = c*re + s*im,   im' = c*im - s*re
//      using (c, s), (c, s)^2 and (c, s)^3 respectively. Output 0 is unscaled.
//      Table and index per round (index is in units of 8-byte entries):
//          lut_sp_4_4096 : tid.x & 1023
//          lut_sp_4_1024 : bits [2, 10) of tid.x
//          lut_sp_4_256  : bits [4, 10) of tid.x
//          lut_sp_4_64   : bits [6, 10) of tid.x
//          lut_sp_4_16   : bits [8, 10) of tid.x
//   3. The last butterfly has no scaling and writes straight to the outputs.
//
// Shared-memory usage:
//   * Region base = smem + (tid.y << 14) + ((tid.x << 4) & -16384), i.e. the
//     region is selected in 16 KiB steps.
//   * Each round moves the first components of the four values in one pass and
//     the second components in a second pass over the SAME region, so a round
//     is: barrier, store, barrier, load, barrier, store, barrier, load.
//   * Stores place a thread's four floats 4, 16, 64, 256 and 1024 bytes apart
//     in rounds 1..5 (round 1 uses a single st.shared.v4.f32). Loads always
//     read base + 4*(tid.x & 1023) + {0, 4096, 8192, 12288}.
//   * barrier.sync 0 is executed unconditionally 20 times.
//     NOTE(review): this presumably requires every thread of the block to call
//     this function the same way - confirm against the caller.
//
// The lookup tables lut_sp_4_* are not declared in this block; they are passed
// as 64-bit ("l") operands and dereferenced with ld.global.
template<> __forceinline__ __device__ void cufftdx_private_function<310, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// Everything between R"({ and })" is PTX text handed to the assembler; %N
// placeholders refer to the operand list after the string (numbering below).
asm volatile (R"({
.reg .f32 f<298>;
.reg .b32 r<43>;
.reg .b64 rd<18>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 14;
mov.u32 r3, %8;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f17, %14, %19;
add.f32 f18, %15, %21;
sub.f32 f19, %14, %19;
sub.f32 f20, %15, %21;
add.f32 f21, %16, %22;
add.f32 f22, %18, %23;
sub.f32 f23, %16, %22;
sub.f32 f24, %18, %23;
add.f32 f25, f17, f21;
add.f32 f26, f18, f22;
sub.f32 f27, f17, f21;
sub.f32 f28, f18, f22;
sub.f32 f29, f19, f24;
add.f32 f30, f20, f23;
add.f32 f31, f19, f24;
sub.f32 f32, f20, f23;
and.b32 r6, r5, 1023;
shl.b32 r7, r5, 3;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 8184;
mov.u64 rd4, %9;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f33, f34}, [rd5];
mul.f32 f37, f30, f34;
fma.rn.f32 f38, f33, f29, f37;
mul.f32 f39, f29, f34;
mul.f32 f40, f33, f30;
sub.f32 f41, f40, f39;
mul.f32 f42, f33, f33;
mul.f32 f43, f34, f34;
sub.f32 f44, f42, f43;
mul.f32 f45, f34, f33;
fma.rn.f32 f46, f34, f33, f45;
mul.f32 f47, f28, f46;
fma.rn.f32 f48, f44, f27, f47;
mul.f32 f49, f27, f46;
mul.f32 f50, f44, f28;
sub.f32 f51, f50, f49;
mul.f32 f52, f33, f44;
mul.f32 f53, f34, f46;
sub.f32 f54, f52, f53;
mul.f32 f55, f33, f46;
fma.rn.f32 f56, f34, f44, f55;
mul.f32 f57, f32, f56;
fma.rn.f32 f58, f54, f31, f57;
mul.f32 f59, f31, f56;
mul.f32 f60, f54, f32;
sub.f32 f61, f60, f59;
shl.b32 r8, r5, 4;
and.b32 r9, r8, -16384;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 16368;
add.s32 r12, r10, r11;
st.shared.v4.f32 [r12], {f25, f38, f48, f58};
barrier.sync 0;
mad.lo.s32 r13, r6, -12, r12;
ld.shared.f32 f62, [r13];
ld.shared.f32 f63, [r13+4096];
ld.shared.f32 f64, [r13+8192];
ld.shared.f32 f65, [r13+12288];
barrier.sync 0;
st.shared.v4.f32 [r12], {f26, f41, f51, f61};
barrier.sync 0;
ld.shared.f32 f66, [r13];
ld.shared.f32 f67, [r13+4096];
ld.shared.f32 f68, [r13+8192];
ld.shared.f32 f69, [r13+12288];
add.f32 f70, f62, f64;
add.f32 f71, f66, f68;
sub.f32 f72, f62, f64;
sub.f32 f73, f66, f68;
add.f32 f74, f63, f65;
add.f32 f75, f67, f69;
sub.f32 f76, f63, f65;
sub.f32 f77, f67, f69;
add.f32 f78, f70, f74;
add.f32 f79, f71, f75;
sub.f32 f80, f70, f74;
sub.f32 f81, f71, f75;
sub.f32 f82, f72, f77;
add.f32 f83, f73, f76;
add.f32 f84, f72, f77;
sub.f32 f85, f73, f76;
and.b32 r14, r5, 1020;
bfe.u32 r15, r5, 2, 8;
mul.wide.u32 rd6, r15, 8;
mov.u64 rd7, %10;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f86, f87}, [rd8];
mul.f32 f90, f83, f87;
fma.rn.f32 f91, f86, f82, f90;
mul.f32 f92, f82, f87;
mul.f32 f93, f86, f83;
sub.f32 f94, f93, f92;
mul.f32 f95, f86, f86;
mul.f32 f96, f87, f87;
sub.f32 f97, f95, f96;
mul.f32 f98, f87, f86;
fma.rn.f32 f99, f87, f86, f98;
mul.f32 f100, f81, f99;
fma.rn.f32 f101, f97, f80, f100;
mul.f32 f102, f80, f99;
mul.f32 f103, f97, f81;
sub.f32 f104, f103, f102;
mul.f32 f105, f86, f97;
mul.f32 f106, f87, f99;
sub.f32 f107, f105, f106;
mul.f32 f108, f86, f99;
fma.rn.f32 f109, f87, f97, f108;
mul.f32 f110, f85, f109;
fma.rn.f32 f111, f107, f84, f110;
mul.f32 f112, f84, f109;
mul.f32 f113, f107, f85;
sub.f32 f114, f113, f112;
shl.b32 r16, r5, 2;
and.b32 r17, r16, 12;
add.s32 r18, r10, r17;
barrier.sync 0;
and.b32 r19, r8, 16320;
add.s32 r20, r18, r19;
st.shared.f32 [r20], f78;
st.shared.f32 [r20+16], f91;
st.shared.f32 [r20+32], f101;
st.shared.f32 [r20+48], f111;
barrier.sync 0;
mad.lo.s32 r21, r14, -12, r20;
ld.shared.f32 f115, [r21];
ld.shared.f32 f116, [r21+4096];
ld.shared.f32 f117, [r21+8192];
ld.shared.f32 f118, [r21+12288];
barrier.sync 0;
st.shared.f32 [r20], f79;
st.shared.f32 [r20+16], f94;
st.shared.f32 [r20+32], f104;
st.shared.f32 [r20+48], f114;
barrier.sync 0;
ld.shared.f32 f119, [r21];
ld.shared.f32 f120, [r21+4096];
ld.shared.f32 f121, [r21+8192];
ld.shared.f32 f122, [r21+12288];
add.f32 f123, f115, f117;
add.f32 f124, f119, f121;
sub.f32 f125, f115, f117;
sub.f32 f126, f119, f121;
add.f32 f127, f116, f118;
add.f32 f128, f120, f122;
sub.f32 f129, f116, f118;
sub.f32 f130, f120, f122;
add.f32 f131, f123, f127;
add.f32 f132, f124, f128;
sub.f32 f133, f123, f127;
sub.f32 f134, f124, f128;
sub.f32 f135, f125, f130;
add.f32 f136, f126, f129;
add.f32 f137, f125, f130;
sub.f32 f138, f126, f129;
and.b32 r22, r5, 1008;
bfe.u32 r23, r5, 4, 6;
mul.wide.u32 rd9, r23, 8;
mov.u64 rd10, %11;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f139, f140}, [rd11];
mul.f32 f143, f136, f140;
fma.rn.f32 f144, f139, f135, f143;
mul.f32 f145, f135, f140;
mul.f32 f146, f139, f136;
sub.f32 f147, f146, f145;
mul.f32 f148, f139, f139;
mul.f32 f149, f140, f140;
sub.f32 f150, f148, f149;
mul.f32 f151, f140, f139;
fma.rn.f32 f152, f140, f139, f151;
mul.f32 f153, f134, f152;
fma.rn.f32 f154, f150, f133, f153;
mul.f32 f155, f133, f152;
mul.f32 f156, f150, f134;
sub.f32 f157, f156, f155;
mul.f32 f158, f139, f150;
mul.f32 f159, f140, f152;
sub.f32 f160, f158, f159;
mul.f32 f161, f139, f152;
fma.rn.f32 f162, f140, f150, f161;
mul.f32 f163, f138, f162;
fma.rn.f32 f164, f160, f137, f163;
mul.f32 f165, f137, f162;
mul.f32 f166, f160, f138;
sub.f32 f167, f166, f165;
and.b32 r24, r16, 60;
add.s32 r25, r10, r24;
barrier.sync 0;
and.b32 r26, r8, 16128;
add.s32 r27, r25, r26;
st.shared.f32 [r27], f131;
st.shared.f32 [r27+64], f144;
st.shared.f32 [r27+128], f154;
st.shared.f32 [r27+192], f164;
barrier.sync 0;
mad.lo.s32 r28, r22, -12, r27;
ld.shared.f32 f168, [r28];
ld.shared.f32 f169, [r28+4096];
ld.shared.f32 f170, [r28+8192];
ld.shared.f32 f171, [r28+12288];
barrier.sync 0;
st.shared.f32 [r27], f132;
st.shared.f32 [r27+64], f147;
st.shared.f32 [r27+128], f157;
st.shared.f32 [r27+192], f167;
barrier.sync 0;
ld.shared.f32 f172, [r28];
ld.shared.f32 f173, [r28+4096];
ld.shared.f32 f174, [r28+8192];
ld.shared.f32 f175, [r28+12288];
add.f32 f176, f168, f170;
add.f32 f177, f172, f174;
sub.f32 f178, f168, f170;
sub.f32 f179, f172, f174;
add.f32 f180, f169, f171;
add.f32 f181, f173, f175;
sub.f32 f182, f169, f171;
sub.f32 f183, f173, f175;
add.f32 f184, f176, f180;
add.f32 f185, f177, f181;
sub.f32 f186, f176, f180;
sub.f32 f187, f177, f181;
sub.f32 f188, f178, f183;
add.f32 f189, f179, f182;
add.f32 f190, f178, f183;
sub.f32 f191, f179, f182;
and.b32 r29, r5, 960;
bfe.u32 r30, r5, 6, 4;
mul.wide.u32 rd12, r30, 8;
mov.u64 rd13, %12;
add.s64 rd14, rd13, rd12;
ld.global.v2.f32 {f192, f193}, [rd14];
mul.f32 f196, f189, f193;
fma.rn.f32 f197, f192, f188, f196;
mul.f32 f198, f188, f193;
mul.f32 f199, f192, f189;
sub.f32 f200, f199, f198;
mul.f32 f201, f192, f192;
mul.f32 f202, f193, f193;
sub.f32 f203, f201, f202;
mul.f32 f204, f193, f192;
fma.rn.f32 f205, f193, f192, f204;
mul.f32 f206, f187, f205;
fma.rn.f32 f207, f203, f186, f206;
mul.f32 f208, f186, f205;
mul.f32 f209, f203, f187;
sub.f32 f210, f209, f208;
mul.f32 f211, f192, f203;
mul.f32 f212, f193, f205;
sub.f32 f213, f211, f212;
mul.f32 f214, f192, f205;
fma.rn.f32 f215, f193, f203, f214;
mul.f32 f216, f191, f215;
fma.rn.f32 f217, f213, f190, f216;
mul.f32 f218, f190, f215;
mul.f32 f219, f213, f191;
sub.f32 f220, f219, f218;
and.b32 r31, r16, 252;
add.s32 r32, r10, r31;
barrier.sync 0;
and.b32 r33, r8, 15360;
add.s32 r34, r32, r33;
st.shared.f32 [r34], f184;
st.shared.f32 [r34+256], f197;
st.shared.f32 [r34+512], f207;
st.shared.f32 [r34+768], f217;
barrier.sync 0;
mad.lo.s32 r35, r29, -12, r34;
ld.shared.f32 f221, [r35];
ld.shared.f32 f222, [r35+4096];
ld.shared.f32 f223, [r35+8192];
ld.shared.f32 f224, [r35+12288];
barrier.sync 0;
st.shared.f32 [r34], f185;
st.shared.f32 [r34+256], f200;
st.shared.f32 [r34+512], f210;
st.shared.f32 [r34+768], f220;
barrier.sync 0;
ld.shared.f32 f225, [r35];
ld.shared.f32 f226, [r35+4096];
ld.shared.f32 f227, [r35+8192];
ld.shared.f32 f228, [r35+12288];
add.f32 f229, f221, f223;
add.f32 f230, f225, f227;
sub.f32 f231, f221, f223;
sub.f32 f232, f225, f227;
add.f32 f233, f222, f224;
add.f32 f234, f226, f228;
sub.f32 f235, f222, f224;
sub.f32 f236, f226, f228;
add.f32 f237, f229, f233;
add.f32 f238, f230, f234;
sub.f32 f239, f229, f233;
sub.f32 f240, f230, f234;
sub.f32 f241, f231, f236;
add.f32 f242, f232, f235;
add.f32 f243, f231, f236;
sub.f32 f244, f232, f235;
and.b32 r36, r5, 768;
bfe.u32 r37, r5, 8, 2;
mul.wide.u32 rd15, r37, 8;
mov.u64 rd16, %13;
add.s64 rd17, rd16, rd15;
ld.global.v2.f32 {f245, f246}, [rd17];
mul.f32 f249, f242, f246;
fma.rn.f32 f250, f245, f241, f249;
mul.f32 f251, f241, f246;
mul.f32 f252, f245, f242;
sub.f32 f253, f252, f251;
mul.f32 f254, f245, f245;
mul.f32 f255, f246, f246;
sub.f32 f256, f254, f255;
mul.f32 f257, f246, f245;
fma.rn.f32 f258, f246, f245, f257;
mul.f32 f259, f240, f258;
fma.rn.f32 f260, f256, f239, f259;
mul.f32 f261, f239, f258;
mul.f32 f262, f256, f240;
sub.f32 f263, f262, f261;
mul.f32 f264, f245, f256;
mul.f32 f265, f246, f258;
sub.f32 f266, f264, f265;
mul.f32 f267, f245, f258;
fma.rn.f32 f268, f246, f256, f267;
mul.f32 f269, f244, f268;
fma.rn.f32 f270, f266, f243, f269;
mul.f32 f271, f243, f268;
mul.f32 f272, f266, f244;
sub.f32 f273, f272, f271;
and.b32 r38, r16, 1020;
add.s32 r39, r10, r38;
barrier.sync 0;
and.b32 r40, r8, 12288;
add.s32 r41, r39, r40;
st.shared.f32 [r41], f237;
st.shared.f32 [r41+1024], f250;
st.shared.f32 [r41+2048], f260;
st.shared.f32 [r41+3072], f270;
barrier.sync 0;
mad.lo.s32 r42, r36, -12, r41;
ld.shared.f32 f274, [r42];
ld.shared.f32 f275, [r42+4096];
ld.shared.f32 f276, [r42+8192];
ld.shared.f32 f277, [r42+12288];
barrier.sync 0;
st.shared.f32 [r41], f238;
st.shared.f32 [r41+1024], f253;
st.shared.f32 [r41+2048], f263;
st.shared.f32 [r41+3072], f273;
barrier.sync 0;
ld.shared.f32 f278, [r42];
ld.shared.f32 f279, [r42+4096];
ld.shared.f32 f280, [r42+8192];
ld.shared.f32 f281, [r42+12288];
add.f32 f282, f274, f276;
add.f32 f283, f278, f280;
sub.f32 f284, f274, f276;
sub.f32 f285, f278, f280;
add.f32 f286, f275, f277;
add.f32 f287, f279, f281;
sub.f32 f288, f275, f277;
sub.f32 f289, f279, f281;
add.f32 %0, f282, f286;
add.f32 %1, f283, f287;
add.f32 %3, f285, f288;
sub.f32 %2, f284, f289;
sub.f32 %4, f282, f286;
sub.f32 %5, f283, f287;
sub.f32 %7, f285, f288;
add.f32 %6, f284, f289;
})"
     // Operand numbering used by the PTX above:
     //   outputs  %0..%7  = rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y,
     //                      rmem[2].x, rmem[2].y, rmem[3].x, rmem[3].y
     //   inputs   %8      = smem
     //            %9..%13 = lut_sp_4_4096, lut_sp_4_1024, lut_sp_4_256,
     //                      lut_sp_4_64, lut_sp_4_16
     //            %14,%15 = rmem[0].x, rmem[0].y
     //            %16     = rmem[1].x
     //            %17,%18 = rmem[1].y (listed twice; the PTX reads only %18)
     //            %19     = rmem[2].x
     //            %20,%21 = rmem[2].y (listed twice; the PTX reads only %21)
     //            %22,%23 = rmem[3].x, rmem[3].y
     // The duplicated entries shift every later index, so they must stay in
     // place unless the %N references in the PTX are renumbered to match.
     // All input operands are read by the first eight add/sub instructions and
     // the outputs are written only by the last eight.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_4096), "l"(lut_sp_4_1024), "l"(lut_sp_4_256), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y));
};


#endif
